# Recommendation

In [None]:
from pandas import read_csv, DataFrame, Series

In [None]:
# Load the raw StackSample tables (this takes a long time).
# Dataset source: https://www.kaggle.com/datasets/stackoverflow/stacksample/data
# The three files are read in the order answers, questions, tags.
raw_answers, raw_questions, raw_tags = (
    read_csv(csv_path, encoding='latin-1')
    for csv_path in ('./data/Answers.csv', './data/Questions.csv', './data/Tags.csv')
)

## Data cleaning and fitting

In [None]:
# Work on copies so the raw frames stay untouched.
# NOTE: the following cell rebuilds `questions`, `answers` and `tags` directly
# from the raw frames, so these copies only matter if that cell is skipped.
questions, answers, tags = (
    frame.copy() for frame in (raw_questions, raw_answers, raw_tags)
)

In [None]:
# Collect all tags of a question into one list, keyed by the question Id.
tags_by_question = raw_tags.groupby('Id')['Tag'].apply(list)
# Flat (Id, Tag) frame, kept under the original name and layout.
tags = tags_by_question.reset_index()

# Join on the question Id carried by the Series index. Joining the
# reset-index frame instead would match Ids against row positions and attach
# the tag lists to unrelated rows.
questions = raw_questions.join(tags_by_question, on='Id')
# An answer inherits the tags of the question it belongs to (ParentId).
answers = raw_answers.join(tags_by_question, on='ParentId')
questions

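**answers_filtered** is a dataframe containing only the answers that have at least one of the most frequent tags, i.e. tags whose number of occurrences is above the 0.999 quantile of all tag counts. Less frequent tags are removed from each answer's tag list.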

In [None]:
# Keep only the most common tags: those whose occurrence count lies strictly
# above the 0.999 quantile of all tag counts.
count_tags = raw_tags['Tag'].value_counts()
limit = count_tags.quantile(0.999)
most_frequent_tags = count_tags[count_tags > limit]

# One row per (answer, tag); the 'index' column holds the answer's row label.
answers_per_tags = answers['Tag'].explode().reset_index()
answers_per_tags = answers_per_tags[answers_per_tags['Tag'].isin(most_frequent_tags.keys())]
# Re-assemble the surviving tags into one list per answer.
answers_per_tags = answers_per_tags.groupby('index')['Tag'].apply(list).reset_index()

# Index the tag lists by the answer's row label so the join aligns each list
# with its own answer. Joining the positionally indexed frame would attach
# the lists to unrelated rows and leave most rows without tags.
frequent_tags_by_answer = answers_per_tags.set_index('index')['Tag'].rename_axis(None)
answers_filtered = answers.loc[frequent_tags_by_answer.index].drop('Tag', axis=1)
answers_filtered = answers_filtered.join(frequent_tags_by_answer).dropna(subset='Tag')
answers_filtered

Now we keep only the most active users, i.e. those who posted the most answers.

In [None]:
# Number of filtered answers posted by each user.
most_frequent_users = answers_filtered['OwnerUserId'].value_counts()
# Keep the users whose answer count is strictly above the 0.9 quantile,
# i.e. roughly the 10% most active answerers.
limit = most_frequent_users.quantile(0.9)
most_frequent_users = most_frequent_users[most_frequent_users > limit]

# Build the mask from the frame being filtered so both share the same index
# (no reliance on implicit re-alignment with the larger `answers` frame).
is_frequent_user = answers_filtered['OwnerUserId'].isin(most_frequent_users.keys())
answers_filtered2 = answers_filtered.loc[is_frequent_user]
# One row per (answer, tag) pair.
answers_filtered2 = answers_filtered2.explode('Tag')
answers_filtered2

In [None]:
# Count the rows of every (user, tag) pair.
grouped = answers_filtered2.groupby(by=['OwnerUserId', 'Tag'])
count = grouped.size()

# Pivot to a user x tag matrix; pairs that never occur get a count of 0.
User_per_tag = count.unstack(level='Tag', fill_value=0)

# Normalise each row by its total so every user's row sums to 1, then drop
# any row that contains a NaN.
row_totals = User_per_tag.sum(axis='columns')
User_per_tag_normed = User_per_tag.divide(row_totals, axis='index').dropna()
User_per_tag_normed.reset_index()

In [None]:
# Peek at the normalised tag distribution of the first user in the matrix.
User_per_tag_normed.iloc[0, :]

In [None]:
# Long-format (OwnerUserId, Tag, count) table used as the ratings input.
# Despite its name, the 'count' column holds a share: the pair's count divided
# by the total count of that user, which is exactly the matching cell of
# User_per_tag_normed. Only pairs that actually occur are kept.
# Computed in one vectorised step instead of a row-by-row loop, which also
# avoids writing floats into an integer column.
total_per_user = count.groupby(level='OwnerUserId').transform('sum')
User_per_tag2 = count.div(total_per_user).reset_index(name='count')

## Analysis

In [None]:
from pandas import read_csv, DataFrame
from sklearn.decomposition import NMF as NMF_sklearn
from numpy import dot
from surprise import Dataset, SVD, KNNBasic, NMF
from surprise.reader import Reader
from surprise.model_selection import train_test_split, cross_validate

### First Recommendation Using NMF

In [None]:
# Factorise the user x tag matrix into 20 latent components.
# A fixed random_state makes the factorisation reproducible across runs.
model = NMF_sklearn(n_components=20, random_state=0)
W = model.fit_transform(User_per_tag_normed)

# Reconstructed matrix W @ H: the predicted affinity of every user for every
# tag. Keep the user index so each row stays labelled with its OwnerUserId.
nmf_res = DataFrame(
    dot(W, model.components_),
    index=User_per_tag_normed.index,
    columns=User_per_tag_normed.columns,
)
nmf_res

In [None]:
# Print the reconstructed score of every tag for the first user, 2 decimals.
first_user_scores = nmf_res.iloc[0].to_dict()
for tag, score in first_user_scores.items():
	print(tag, format(score, ".2f"))

### Second Recommendation: Evaluating SVD, KNNBasic and NMF

In [None]:
# The ratings are normalised shares in (0, 1], so the rating scale must be
# declared explicitly: Surprise defaults to (1, 5) and clips predictions to
# the declared scale, which would make RMSE/MAE meaningless here.
# Only rating_scale is relevant when loading from a DataFrame.
reader = Reader(rating_scale=(0, 1))
# Columns are read positionally as (user, item, rating).
dataset = Dataset.load_from_df(User_per_tag2, reader)

# 5-fold cross-validation of each algorithm; verbose=True prints the
# per-fold RMSE/MAE table, and the raw results are kept per algorithm name.
cv_results = {
    algo_class.__name__: cross_validate(
        algo_class(), dataset, measures=["RMSE", "MAE"], cv=5, verbose=True
    )
    for algo_class in (SVD, KNNBasic, NMF)
}