# P5


In [1]:
import pandas as pd
import numpy as np

import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:

docs = pd.read_csv("QueryResults.csv")

In [3]:
docs.set_index('Id').sort_index()

Unnamed: 0_level_0,Title,Body,Tags,Score,ViewCount,FavoriteCount,AnswerCount
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4,How to convert a Decimal to a Double in C#?,<p>I want to use a <code>Track-Bar</code> to c...,<c#><floating-point><type-conversion><double><...,759,64063,58,12
11,Calculate relative time in C#,<p>Given a specific <code>DateTime</code> valu...,<c#><datetime><time><datediff><relative-time-s...,1612,188799,552,41
13,Determine a user's timezone,<p>Is there a standard way for a web server to...,<html><browser><timezone><user-agent><timezone...,673,248432,157,27
19,What is the fastest way to get the value of π?,<p>I'm looking for the fastest way to obtain t...,<performance><algorithm><language-agnostic><un...,341,63943,86,23
88,Is gettimeofday() guaranteed to be of microsec...,"<p>I am porting a game, that was originally wr...",<linux><winapi><visual-c++><unix><timer>,104,43767,19,10
...,...,...,...,...,...,...,...
69729326,Endless sine generation in C,<p>I am working on a project which incorporate...,<c><performance><time><precision><trigonometry>,91,10074,25,12
69832748,"Error ""Error: A <Route> is only ever to be use...",<p>I am trying to use routing for the first ti...,<javascript><node.js><reactjs><frameworks><rea...,57,77406,13,14
69875125,find_element_by_* commands are deprecated in s...,<p>When starting the function</p>\n<pre><code>...,<python><selenium><selenium-webdriver><webdriv...,52,59346,25,3
70358643,"""You are running create-react-app 4.0.3 which ...",<p>I got an error while creating a React appli...,<javascript><reactjs><npm-install><yarnpkg><npx>,167,76256,35,6


In [4]:
docs.drop(['Id', 'Score', 'ViewCount', 'FavoriteCount','AnswerCount'], axis=1, inplace=True)

## Data Preparation


### Tags preprocessing

In [6]:
# tokenize the tags
def tokenizeTags(tags):
    """
    Remove some unwanted characters
    """
    tags = tags.replace('<', ' ')
    tags = tags.replace('>', '')
    return tags.strip().split(' ')

docs['Tags'] = docs['Tags'].apply(tokenizeTags)

In [7]:
# one doc tags sample
docs['Tags'][0]

['javascript', 'geolocation', 'projection', 'processing.js', 'proj4js']

In [20]:
# get 200 most frequent tags
all_tags = [item for sublist in docs['Tags'] for item in sublist]
unique, counts = np.unique(all_tags, return_counts=True)
pd.DataFrame(unique, counts).sort_index(ascending=False)[:200]


Unnamed: 0,0
3054,java
3025,python
2695,javascript
2644,c#
2391,ios
...,...
99,architecture
99,apache-spark-sql
98,encryption
98,configuration


In [None]:

# multi binarize the tags
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit([all_tags])

print(mlb.classes_)

# Test
# one doc tags sample
print(docs['Tags'][0])
tags_mlb = mlb.transform([docs['Tags'][0].split()])
print(tags_mlb)

# previous multiclass binarizer is very huge, instead filter directly where indexes equals '1'
for tag_index in np.where(tags_mlb == 1 )[1]:
    print(f'{tag_index} : {mlb.classes_[tag_index]}')
    

In [None]:
tags_count = pd.DataFrame({'tags': mlb.classes_})
tags_count['count'] = ""
tags_count

In [None]:
for i, tag in enumerate(tags_count['tags']):
    tags_count['count'].iloc[i] = all_tags.count(tag)

In [None]:
tags_count.sort_values('count', ascending=False)

In [None]:
import matplotlib.pyplot as plt

tags_count.sort_values('count', ascending=False)['count'].hist()


In [None]:
tags_count.describe()

In [None]:
# select only the tags that appear more than 100 times, thus reducing the number of tags to approx. 200
# this produces our vocabulary for tagging
currated_tags = tags_count[tags_count['count'] >= 100]['tags'].to_list()
print(len(currated_tags))
currated_tags

### Word Cloud

In [None]:
from wordcloud import WordCloud

long_string = ','.join(docs['Tags'])
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(long_string)

# Visualize the word cloud
wordcloud.to_image()

### Words preprocessing

In [None]:
# WIP
# how to remove unwanted chars in the question body ?
import re

zero = docs.iloc[0]
re.sub('(<([^>]+)>)', '', zero.Body).replace('\n', '')


#### Remove code snippets

In [None]:
docs['Body'] = docs['Body'].apply(lambda d: d.replace('&lt;', '<').replace('&gt;', '>'))

In [None]:
from bs4 import BeautifulSoup

def remove_code_snippets(docs):
    """
    Remove code snippets from docs
    surrounded by <pre> tags
    """
    # ensure '<' & '>' chars are set, not the unicode char
    docs = docs.apply(lambda d: d.replace('&lt;', '<').replace('&gt;', '>'))
    ret = []
    for doc in docs:
        soup = BeautifulSoup(doc, 'html.parser')
        removals = soup.find_all('pre')
        for pre in removals:
            pre.decompose()
        ret.append(soup.get_text())
    return ret


In [None]:
docs['Body'] = remove_code_snippets(docs['Body'])

In [None]:
# words are the documents to use for the model
# words = docs['Body'].to_list()
words = docs['Body']

#### Remove Stop words, Lemmatize, Stemmize

In [None]:
# remove stop words
# lemmatize (convert and remove prefixes and suffixes to ignore the gender, plural, verb : walking, walks, walked, walk -> walk)
# and/or stemmize (take the same root of words : universe / university -> univers)

from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
stopwords.update(['p', 'gt', 'lt', 'li', 'ul', 'img', 'src', 'td', 'tr', 'table', 'div', 'code'])
filtre_stopw =  lambda text: [token.lower() for token in text if token.lower() not in stopwords]

from nltk.stem.snowball import EnglishStemmer
stemmer = EnglishStemmer()

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

import nltk
tokenizer = nltk.RegexpTokenizer(r'\w+')


# tokenize and remove stop words
def tokenize_sw_stem(doc):
	# nltk.word_tokenize(doc)
    filtered = filtre_stopw(tokenizer.tokenize(doc))
    ret = " ".join([stemmer.stem(s) for s in filtered] )
    return ret

def tokenize_sw_lem(doc):
    # nltk.word_tokenize(doc)
    filtered = filtre_stopw(tokenizer.tokenize(doc))
    ret = " ".join([lemmatizer.lemmatize(s) for s in filtered] )
    return ret


words_stem = list(map(tokenize_sw_stem, words))
words_lem = list(map(tokenize_sw_lem, words))
# words

In [None]:
words[543]

In [None]:
words_stem[543]

In [None]:
words_lem[543]

In [None]:
# let's choose the lemmatize version of the corpus, since it results in real words that could be used as tags
docs['currated_body'] = words_lem

In [None]:
docs[['Body', 'currated_body']]

#### Remove bad words


In [None]:

re.sub(r'[0-9]+', '', words_lem[4733])
# words tha contains underscores
re.sub(r'_+', '', words_lem[4733])
# words_lem[0]

In [None]:
import re

def remove_words(words):
    ret = []
    for word in words:
        # print(f'\nprocessing sentence: {word}\n')
        flat = re.sub(r'[0-9]+', '', word) # words that are numbers
        flat = re.sub(r'_+', '', flat) # words tha contains underscores
        flat = re.sub(r'\w{15,}', '', flat) # words longer than 15 chars are most probably code (i.e. myClass.myMethodName())
        flat = " ".join(flat.split()) # remove double spaces
        # print(f'\ncurrated : {flat}')
        ret.append(flat)
    return ret


# re.sub(r'[0-9]*', '', words_lem[6543])
# re.sub(r'_+', '', words_lem[6543])
# currated = list(map(lambda x : re.sub(r'[0-9]*', '', x), words_lem))


In [None]:
currated = remove_words(words_lem)


In [None]:
currated[2649]


In [None]:
words_lem[2649]

In [None]:
docs['currated_body'] = currated

In [None]:
docs[['Body', 'currated_body']]

#### MultiLabel Binarizer

In [None]:
# savoir quel est le nombre de tags max
docs['tags_count'] = docs['Tags'].apply(lambda x : len(str(x).split())) # --> chaque docs à toujours 5 tags
# faire une colonne du DF pour chaque tags
docs[['tag0', 'tag1', 'tag2', 'tag3', 'tag4']] = docs['Tags'].str.split(expand=True)
# donner les colonnes au mlb


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

def mlb_tags(docs):
    docs[['tag0', 'tag1', 'tag2', 'tag3', 'tag4']] = docs['Tags'].str.split(expand=True)
    appended_tags = docs['Tags'].str.split(expand=True).stack()
    light_tag_list = appended_tags.value_counts()[:200]
    # fit the mlb with common tags (200)
    mlb.fit([light_tag_list.index])
    print(mlb.classes_.shape)
    docs['mlb_tags'] = mlb.transform(docs[['tag0', 'tag1', 'tag2', 'tag3', 'tag4']].values).tolist()



In [None]:
# docs.drop('mlb_tags', axis=1, inplace=True)
# docs.drop(['tag0', 'tag1', 'tag2', 'tag3', 'tag4'], axis=1, inplace=True)

In [None]:
mlb_tags(docs)

#### Corpus size

In [None]:
# count the total number of words in the overall corpus
# needs to be limited to 2000 / 3000 words maximum
lem = map(lambda x : x.split(), docs['currated_body'])
flat_list = [word for sublist in list(lem) for word in sublist ]


In [None]:

# if assertion fails, means we need to reduce the number of words 
# -> add the number of stop words
unique_words = np.unique(flat_list)
print(unique_words.shape[0])


## Train, Test Sets

In [None]:
from sklearn.model_selection import train_test_split

X = docs.drop(columns='mlb_tags', axis=1)
y = docs['mlb_tags']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## Tf-Idf BOW

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(docs['currated_body'])
vectorizer.get_feature_names_out()
print(X.shape)
# X = pd.DataFrame(X.T.todense())


In [None]:
with open('vectorizer', 'wb') as v :
    pickle.dump(vectorizer, v)


In [None]:
len(vectorizer.vocabulary_)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

def predict_tags(doc: str) -> list[str]:  
    tf1_new = TfidfVectorizer(analyzer='word', ngram_range=(1,2), stop_words = "english", lowercase = True,
                            max_features = 3000, vocabulary = vectorizer.vocabulary_)
    T = tf1_new.fit_transform(np.array([doc]))
    T = pd.DataFrame(T.T.todense())
    T[0].sort_values(ascending=False)
    # take the 5 most interesting keywords
    indexes = T[0].sort_values(ascending=False)[:5]
    # print(indexes)
    ret = []
    for i, val in indexes.items():
        # print(i)
        # print(indexes[0].index[i])
        if (val > 0): 
            ret.append(vectorizer.get_feature_names_out()[i])
            # print(vectorizer.get_feature_names_out()[i])
    return ret

In [None]:
# X = vectorizer.fit_transform(docs['currated_body'])

feature_names = np.array(vectorizer.get_feature_names())

def get_top_tf_idf_words(response, top_n=2):
    sorted_nzs = np.argsort(response.data)[:-(top_n+1):-1]
    return feature_names[response.indices[sorted_nzs]]

docs['tfidf_tags'] = [get_top_tf_idf_words(X, 5) for X in X]


In [None]:
# docs['Tags'].str.split(expand=True)
docs['Tags']

In [None]:
docs['tfidf_tags'].to_list

In [None]:

# [re.sub(r',', r' ', s) for s in str(docs['tfidf_tags'])]
docs['tfidf_tags'][0]


#### Multi-label scoring

<https://medium.datadriveninvestor.com/predicting-tags-for-the-questions-in-stack-overflow-29438367261e>



In [None]:
# docs['predicted_tags'] = docs['currated_body'].apply(lambda row: predict_tags(row))
docs['predicted_tags_idf_mlb'] = [mlb.transform(tags) for tags in docs['tfidf_tags']] 



# mlb.transform([predict_tags(docs['currated_body'].iloc[789])])

In [None]:
index = 7541
X = words_lem[index]
y = docs['Tags'][index]
predict = predict_tags(np.array(words_lem[index]))
print(y.split())
# print(X)
print(predict)

# count number of label predicted found from expected
count = 0
for tag in predict:
    if tag in y.split():
        # print(f'found tag correctly predicted: {tag}')
        count += 1
print(count)



#  compute the confusion matrix score for one prediction

In [None]:
# compute multilabel binarizer for tags scoring
from sklearn.metrics import multilabel_confusion_matrix

y_true = mlb.transform(y.split())
y_pred = mlb.transform(predict)

print(y_true.shape)
print(y_pred.shape)

# confusion matrix MCM
# MCM (0,0): TN
# MCM (1,0): FN
# MCM (1,1): TP
# MCM (0,1): FP
confusion_matrix = multilabel_confusion_matrix(y_true, y_pred)
confusion_matrix[1602]

np.where(mlb.classes_ == 'download')[0]


#### Prediction

In [None]:
# sample code to predict tags

import pickle


tf1_new = TfidfVectorizer(analyzer='word', ngram_range=(1,2), stop_words = "english", lowercase = True,
                          max_features = 5000, vocabulary = vectorizer.vocabulary_)
T = tf1_new.fit_transform(np.array(["configuring java for accessing database with jdbc is not working with JPA after upgrade of spring 5.1. despite having modified application.properties"]))
T = pd.DataFrame(T.T.todense())

In [None]:
# T
T[0].sort_values(ascending=False)

In [None]:
# take the 3 most interesting keywords
indexes = T[0].sort_values(ascending=False)[:5]


In [None]:
print(indexes)
for i, val in indexes.items():
    # print(i)
    # print(indexes[0].index[i])
    if (val > 0): 
        print(vectorizer.get_feature_names_out()[i])

In [None]:
# random index for testing
doc_n = 543
print(words[doc_n])
sorted = X[doc_n].sort_values(ascending=False)[:15]
print(sorted)
print(vectorizer.get_feature_names_out()[sorted.index[0]]) #take the higher tf-idf value in the list, and get its corresponding word
print(vectorizer.get_feature_names_out()[sorted.index[1]]) 
print(vectorizer.get_feature_names_out()[sorted.index[3]]) 

In [None]:
docs['Tags'][543]

## Word2Vect

In [None]:
# snippet taken from sample notebook
import gensim
import multiprocessing

w2v_epochs=100
maxlen = 24 # adapt to length of sentences

print("Build & train Word2Vec model ...")
w2v_model = gensim.models.Word2Vec(min_count=5, window=5,
                                                vector_size=300,
                                                seed=42,
                                                # workers=30)
                                               workers=multiprocessing.cpu_count())
w2v_model.build_vocab(words_lem)
w2v_model.train(words, total_examples=w2v_model.corpus_count, epochs=w2v_epochs)
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")


In [None]:
with open('w2v_model', 'wb') as v :
    # pickle.dump(w2v_model, v)
    w2v_model.save(v)


In [None]:
# word2vect docs and tutorial at https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py
for index, word in enumerate(w2v_words):
    if index == 10:
        break
    print(f"word #{index}/{len(w2v_words)} is {word}")

In [None]:
## Scoring

In [None]:
import sklearn.metrics as metrics

#  from https://www.kaggle.com/code/michaelfumery/stackoverflow-questions-tag-generator/notebook?scriptVersionId=68023262

def metrics_score(model, df, y_true, y_pred):
    """Compilation function of metrics specific to multi-label
    classification problems in a Pandas DataFrame.
    This dataFrame will have 1 row per metric
    and 1 column per model tested. 

    Parameters
    ----------------------------------------
    model : string
        Name of the tested model
    df : DataFrame 
        DataFrame to extend. 
        If None : Create DataFrame.
    y_true : array
        Array of true values to test
    y_pred : array
        Array of predicted values to test
    ----------------------------------------
    """
    if(df is not None):
        temp_df = df
    else:
        temp_df = pd.DataFrame(index=["Accuracy", "F1",
                                      "Jaccard", "Recall",
                                      "Precision"],
                               columns=[model])
        
    scores = []
    scores.append(metrics.accuracy_score(y_true, 
                                         y_pred))
    scores.append(metrics.f1_score(y_pred, 
                                   y_true, 
                                   average='weighted'))
    scores.append(metrics.jaccard_score(y_true, 
                                        y_pred, 
                                        average='weighted'))
    scores.append(metrics.recall_score(y_true, 
                                       y_pred, 
                                       average='weighted'))
    scores.append(metrics.precision_score(y_true, 
                                          y_pred, 
                                          average='weighted'))
    temp_df[model] = scores
    
    return temp_df

à lire


topic modelling : unsupervised learning

topic classification : supervised learning


<https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/>

<https://www.baeldung.com/cs/ml-word2vec-topic-modeling>

<https://medium.com/le-blog-de-lapprentissage-automatique/pr%C3%A9diction-des-tags-des-questions-de-stack-overflow-9be00f7672f9>



In [None]:
docs['Tags'][543]

'javascript html angularjs angularjs-directive angularjs-ng-repeat'

## Word2Vect

In [75]:
# snippet taken from sample notebook
import gensim
import multiprocessing

w2v_epochs=100
maxlen = 24 # adapt to length of sentences

print("Build & train Word2Vec model ...")
w2v_model = gensim.models.Word2Vec(min_count=5, window=5,
                                                vector_size=300,
                                                seed=42,
                                                # workers=30)
                                               workers=multiprocessing.cpu_count())
w2v_model.build_vocab(words_lem)
w2v_model.train(words, total_examples=w2v_model.corpus_count, epochs=w2v_epochs)
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")


Build & train Word2Vec model ...
Vocabulary size: 64
Word2Vec trained


In [18]:
with open('w2v_model', 'wb') as v :
    # pickle.dump(w2v_model, v)
    w2v_model.save(v)


In [19]:
# word2vect docs and tutorial at https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py
for index, word in enumerate(w2v_words):
    if index == 10:
        break
    print(f"word #{index}/{len(w2v_words)} is {word}")

word #0/290 is  
word #1/290 is e
word #2/290 is t
word #3/290 is o
word #4/290 is a
word #5/290 is i
word #6/290 is n
word #7/290 is r
word #8/290 is s
word #9/290 is l


## Scoring

In [63]:
import sklearn.metrics as metrics

#  from https://www.kaggle.com/code/michaelfumery/stackoverflow-questions-tag-generator/notebook?scriptVersionId=68023262

def metrics_score(model, df, y_true, y_pred):
    """Compilation function of metrics specific to multi-label
    classification problems in a Pandas DataFrame.
    This dataFrame will have 1 row per metric
    and 1 column per model tested. 

    Parameters
    ----------------------------------------
    model : string
        Name of the tested model
    df : DataFrame 
        DataFrame to extend. 
        If None : Create DataFrame.
    y_true : array
        Array of true values to test
    y_pred : array
        Array of predicted values to test
    ----------------------------------------
    """
    if(df is not None):
        temp_df = df
    else:
        temp_df = pd.DataFrame(index=["Accuracy", "F1",
                                      "Jaccard", "Recall",
                                      "Precision"],
                               columns=[model])
        
    scores = []
    scores.append(metrics.accuracy_score(y_true, 
                                         y_pred))
    scores.append(metrics.f1_score(y_pred, 
                                   y_true, 
                                   average='weighted'))
    scores.append(metrics.jaccard_score(y_true, 
                                        y_pred, 
                                        average='weighted'))
    scores.append(metrics.recall_score(y_true, 
                                       y_pred, 
                                       average='weighted'))
    scores.append(metrics.precision_score(y_true, 
                                          y_pred, 
                                          average='weighted'))
    temp_df[model] = scores
    
    return temp_df

à lire


topic modelling : unsupervised learning

topic classification : supervised learning


<https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/>

<https://www.baeldung.com/cs/ml-word2vec-topic-modeling>

<https://medium.com/le-blog-de-lapprentissage-automatique/pr%C3%A9diction-des-tags-des-questions-de-stack-overflow-9be00f7672f9>

