In [2]:
import re
from multiprocessing import Pool

import matplotlib.pyplot as plt
%matplotlib inline
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy.cluster import hierarchy
from scipy.spatial import distance
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE

In [3]:
# Load the three CSV tables from the pythonquestions directory.
# All three files are read with the same single-byte encoding.
Questions = pd.read_csv(
    'pythonquestions/Questions.csv',
    encoding="ISO-8859-1",
)
Answers = pd.read_csv(
    'pythonquestions/Answers.csv',
    encoding="ISO-8859-1",
)
Tags = pd.read_csv(
    'pythonquestions/Tags.csv',
    encoding="ISO-8859-1",
)

FileNotFoundError: File b'pythonquestions/Answers.csv' does not exist

In [None]:
# Preview the first five rows of the questions table.
Questions.head(n=5)

In [None]:
# Preview the first five rows of the answers table.
Answers.head(n=5)

In [None]:
# Preview the first five rows of the tags table.
Tags.head(n=5)

In [None]:
# Report the number of missing values per column (axis=0) for each table.
print(Questions.isnull().sum(axis=0))
print(Answers.isnull().sum(axis=0))
print(Tags.isnull().sum(axis=0))

### Interesting question 1: how many users post questions, answers, or both?

In [None]:
# Distinct user ids that appear as the owner of a question / of an answer.
# Missing owner ids are dropped first: Series.unique() keeps NaN as a value,
# which would otherwise be counted as one extra "user" in each total.
User_id_inQ = Questions['OwnerUserId'].dropna().unique()
User_id_inA = Answers['OwnerUserId'].dropna().unique()
# Ids present in both tables, i.e. users who both asked and answered.
User_id_inBoth = set(User_id_inQ).intersection(User_id_inA)

In [None]:
# Print the three head-counts; sep='' keeps the number directly in front of
# the label text, which already starts with a space.
print(len(User_id_inQ), ' users posting questions', sep='')
print(len(User_id_inA), ' users posting answers', sep='')
print(len(User_id_inBoth), ' users posting both', sep='')

In [None]:
# reduce memory and computation
# Draw a fixed-seed random sample of question rows so that every re-run of the
# notebook works on the same subset (and yields the same clusters downstream).
sample_rng = np.random.RandomState(42)
# Never ask for more rows than exist; choice(..., replace=False) would raise.
sample_size = min(10000, Questions.shape[0])
selected_ids = sample_rng.choice(Questions.shape[0], sample_size, replace=False)
# The drawn values are row positions used as index labels; this relies on the
# default 0..n-1 index that read_csv produces when no index column is given.
sampleQ = Questions.loc[selected_ids, :]

In [None]:
def purify_string(html):
    """Return `html` with tag-like spans removed and line breaks flattened.

    First every `<...>` span (a `<`, one or more characters that are not `<`,
    then the nearest `>`) is deleted. Afterwards each run of `\\r\\n` pairs,
    of `\\r` characters, or of `\\n` characters is replaced by one space.
    """
    without_tags = re.sub('<[^<]+?>', '', html)
    flattened = re.sub('(\r\n)+|\r+|\n+', " ", without_tags)
    return flattened

In [None]:
# Clean the body of every sampled question (tags and line breaks removed).
sampleQbodytext = sampleQ['Body'].apply(purify_string)

In [None]:
def combine_title_body(tnb):
    """Join the first two items of `tnb` (title, then body) with one space."""
    title = tnb[0]
    body = tnb[1]
    return title + " " + body

In [None]:
# Build "title body" text for each sampled question.
# Titles must come from sampleQ (not the full Questions table): the cleaned
# bodies are in sample order, so zipping them with Questions['Title'] would
# pair every body with the title of an unrelated question.
# The context manager shuts the worker pool down even if the mapping raises.
with Pool(8) as p:
    sampleQtext = p.map(combine_title_body, zip(sampleQ['Title'], sampleQbodytext))

In [None]:
# Show the first two combined title/body strings.
sampleQtext[0:2]

In [None]:
# Module-level lemmatizer instance; cond_tokenize calls lem.lemmatize on each token.
lem = WordNetLemmatizer()
def cond_tokenize(t):
    """Tokenize `t` into lower-cased, lemmatized words.

    Parameters
    ----------
    t : str or missing value
        Text to tokenize. `None` and float NaN (the value pandas uses for
        missing text) are both treated as "no text".

    Returns
    -------
    list of str
        One lemmatized, lower-cased token per word found by `word_tokenize`;
        an empty list when `t` is missing.
    """
    # pandas represents missing strings as float NaN rather than None, so a
    # bare `is None` check would let NaN through to word_tokenize and raise.
    if t is None or (isinstance(t, float) and np.isnan(t)):
        return []
    return [lem.lemmatize(w.lower()) for w in word_tokenize(t)]

# Tokenize every combined text in parallel, keeping the input order.
# - The context manager releases the worker processes even if a worker raises.
# - chunksize batches documents per task; the default of 1 sends each document
#   to a worker individually. Results are unchanged.
with Pool(8) as p:
    tokens = list(p.imap(cond_tokenize, sampleQtext, chunksize=100))

In [None]:
# Re-join each document's token list into one space-separated string.
pure_tokens = list(map(" ".join, tokens))

In [None]:
# Show the first two normalized documents.
pure_tokens[0:2]

In [None]:
# TF-IDF over unigrams, limited to the 2000 highest-frequency terms, with
# English stop words removed and sublinear (1 + log tf) term-frequency scaling.
# ngram_range is given as a tuple: that is the documented type, and
# scikit-learn versions that validate constructor parameters reject a list.
vectorizer = TfidfVectorizer(
    max_features=2000,
    stop_words='english',
    ngram_range=(1, 1),
    sublinear_tf=True,
)
tfidf = vectorizer.fit_transform(pure_tokens)

In [None]:
# vocabulary_ maps term -> column id; turn it into (id, word) rows.
vocab_rows = [[col_id, term] for term, col_id in vectorizer.vocabulary_.items()]
idfs = pd.DataFrame(vocab_rows, columns=['id', 'word'])
# Order rows by column id so that the positional assignment of idf_ below
# lines each weight up with its own term.
idfs = idfs.sort_values('id')
idfs['idf'] = vectorizer.idf_
# Terms with the lowest idf, i.e. those occurring in the most documents.
idfs.sort_values('idf').head(10)

In [None]:
tSNE = TSNE(n_components=500, perplexity=30, verbose=2, method='barnes_hut')
transformed = tSNE.fit_transform(tfidf.toarray())

In [None]:
transformed.shape

In [None]:
np.sum(tSNE.explained_variance_ratio_)

In [None]:
# Condensed vector of pairwise cosine distances between the reduced documents.
D = distance.pdist(transformed, metric='cosine')

In [None]:
# Build the hierarchical clustering tree from the condensed distances.
# 'single' is scipy's default linkage method, spelled out here for the reader.
L = hierarchy.linkage(D, method='single')

In [None]:


# Cut the linkage tree into flat clusters using the inconsistency criterion.
# 0.71 is the threshold; change it to get larger or smaller clusters.
cls = hierarchy.fcluster(L, t=0.71, criterion='inconsistent')



In [None]:


# One row per sampled question: its row position and assigned cluster label.
df_cls = pd.DataFrame({'Pos': selected_ids, 'Cluster': cls})
# Cluster sizes, largest first. The result is already sorted, so it is
# displayed directly instead of being sorted a second time.
cnts = df_cls.groupby('Cluster').size().sort_values(ascending=False)
cnts.head()



In [None]:
# add clusters to question data
# The sampled frame is named sampleQ (no variable called `sample` exists).
# sampleQ is indexed by the selected row labels and df_cls is re-indexed by
# the same values ('Pos'), so concat along columns aligns each question with
# its own cluster label.
bc = pd.concat([sampleQ, df_cls.set_index('Pos')], axis=1)
bc.head()

In [None]:
# calculate cluster stats
# Summary statistics of Score per cluster, one row per cluster and one column
# per statistic ('count', 'mean', ...), which is the layout later cells index.
stats = bc.groupby('Cluster')['Score'].describe()
if isinstance(stats, pd.Series):
    # Older pandas returns a (Cluster, statistic) multi-indexed Series here and
    # needs unstack() to reach that layout; current pandas already returns the
    # DataFrame, where unstack() would instead collapse it into a Series.
    stats = stats.unstack()

In [None]:
# Statistics of the ten largest clusters.
stats.sort_values('count', ascending=False).iloc[:10]

In [None]:
# Scatter of mean score against cluster size, drawn through the Axes API.
fig, ax = plt.subplots(figsize=(12, 8))
# Reference lines: mean score of zero, and cluster size of one.
ax.hlines([0], xmin=0, xmax=np.max(stats['count']) + 5, alpha=0.5)
ax.vlines([1], ymin=0, ymax=np.max(stats['mean']) + 50, alpha=0.5)
ax.scatter(stats['count'], stats['mean'], alpha=0.3)
ax.set_title("cluster mean score vs cluster size")
ax.set_xlabel("cluster size")
ax.set_ylabel("mean score")
plt.show()

In [None]:
# Questions belonging to the largest cluster.
bc.loc[bc['Cluster'] == cnts.index[0], ['Score', 'Title', 'Body']]

In [None]:
# Questions belonging to the second-largest cluster.
bc.loc[bc['Cluster'] == cnts.index[1], ['Score', 'Title', 'Body']]

In [None]:
# Questions belonging to the third-largest cluster.
# This cell previously repeated index 0 and re-displayed the largest cluster;
# following the 0, 1 sequence of the cells before it, index 2 is used here.
bc.loc[bc['Cluster'] == cnts.index[2], ['Score', 'Title', 'Body']]