In [20]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.12.0-py2.py3-none-any.whl (631kB)
[K    100% |████████████████████████████████| 634kB 1.6MB/s ta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.12.0


In [86]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from collections import Counter
import re
import numpy as np
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora, models
from textblob import TextBlob
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
from sklearn.model_selection import cross_val_score

In [115]:
DATA_PATH = '../data/processed/stack_ds_4_9_2017 .csv'
WHICH_TAG = 'java'
NUM_TAGS_CONSIDERED = 10

# get data

In [116]:
def get_topn_tags_transform(path,topn=NUM_TAGS_CONSIDERED,tag = 'java'):
    """
    read cleaned data and transform them into one tag per row
    1. get top n tags
    2. expand row
    3. to boolean
    4. aggregate by content
    """
    df = pd.read_csv(path,quotechar='|',sep=',',header=None)
    df.columns = ['title','body','tags']
    merged = [ title + ' ' + body for title, body in zip(df.title,df.body)]
    df_merged = pd.DataFrame({'content':merged,'tags':df.tags})
    df_merged.tags = df_merged.tags.apply(lambda x: x.replace('<','').split('>')[:-1])
    df_transfromed = pd.DataFrame(df_merged.tags.tolist(),index=df_merged.content).stack().reset_index()[['content',0]]
    df_transfromed.columns = ['content','tags']
    top_tags = Counter(df_transfromed.tags).most_common()[:topn]
    top_n_tags = [tag for tag, num in top_tags]
    df_filtered = df_transfromed[df_transfromed.tags.apply(lambda x: x in set(top_n_tags))]
    df_filtered.tags = [int(bool) for bool in df_filtered.tags == tag]
    df_filtered.columns = ['content','is_{}'.format(tag)]
    rslt = df_filtered.groupby('content')['is_{}'.format(tag)].agg(['sum']).reset_index()
    rslt.columns = ['content','is_{}'.format(tag)]
    return rslt, top_n_tags

In [117]:
df, top_n_tags = get_topn_tags_transform(DATA_PATH)
df.head()

Unnamed: 0,content,is_java
0,"""Click"" event not getting triggered due to ""bl...",0
1,"""Command ""python setup.py egg_info"" failed wit...",0
2,"""End-of-central-directory signature not found""...",0
3,"""Initialization-on-demand holder idiom"" - Lazy...",1
4,"""ValueError: I/O operation on closed file"" whe...",0


In [118]:
top_n_tags

['javascript',
 'java',
 'android',
 'php',
 'python',
 'c#',
 'html',
 'jquery',
 'ios',
 'css']

In [119]:
df.shape

(2382, 2)

# tokenize & LDA
- lowercase 
- topwords
- remove if not character
- stemming (seems not that good)

In [120]:
def cleaner(sentence):
    to_be_removed = set(stopwords.words('english'))
    words = [word.lower() for word in word_tokenize(re.sub("[^a-zA-Z]"," ",sentence)) if word.lower() not in to_be_removed]
    nouns =  TextBlob(' '.join([w for w in words])).noun_phrases
    final_sentence = ' '.join([n for n in nouns])
    return final_sentence

In [121]:
def simpleLDA(df,num_topics=10,passes=3):
    texts = df.content.apply(lambda x: cleaner(x).split(' '))
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, passes=passes)
    dict_values = {i:[] for i in range(10)}
    for sample in tqdm(ldamodel.get_document_topics(corpus)):
        row = np.zeros(num_topics)
        for topic_id,value in sample:
            row[topic_id] = value
        for i,v in enumerate(row):
            dict_values[i].append(v)    
    
    return pd.concat((pd.DataFrame(dict_values),df.iloc[:,1]),axis=1)

In [122]:
%%time
simpleLDA(df)

100%|██████████| 2382/2382 [00:01<00:00, 1405.98it/s]


CPU times: user 15.9 s, sys: 16 ms, total: 15.9 s
Wall time: 15.9 s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,is_java
0,0.000000,0.000000,0.159506,0.000000,0.000000,0.614078,0.213206,0.000000,0.000000,0.000000,0
1,0.304355,0.407491,0.000000,0.000000,0.000000,0.000000,0.254805,0.000000,0.000000,0.000000,0
2,0.114381,0.000000,0.000000,0.000000,0.000000,0.867012,0.000000,0.000000,0.000000,0.000000,0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.202165,0.773588,1
4,0.000000,0.000000,0.000000,0.392734,0.315321,0.000000,0.000000,0.000000,0.267802,0.000000,0
5,0.010007,0.010003,0.010002,0.010004,0.010002,0.010003,0.010002,0.010003,0.010002,0.909972,0
6,0.244708,0.000000,0.000000,0.000000,0.000000,0.252276,0.000000,0.478012,0.000000,0.000000,0
7,0.016671,0.016671,0.016667,0.016668,0.016667,0.636244,0.016667,0.230408,0.016670,0.016668,0
8,0.000000,0.000000,0.430842,0.000000,0.000000,0.264600,0.000000,0.274117,0.000000,0.000000,0
9,0.000000,0.000000,0.785094,0.000000,0.000000,0.000000,0.000000,0.000000,0.181565,0.000000,0


In [123]:
df = _

# xgb starter (no validation set,no cv)

In [124]:
from sklearn.model_selection import StratifiedKFold,StratifiedShuffleSplit
import xgboost as xgb
from sklearn.metrics import accuracy_score,recall_score
from sklearn.linear_model import LogisticRegression

In [125]:
X = df.iloc[:,:-1].values
Y = df.iloc[:,-1].values
skf = StratifiedShuffleSplit(n_splits=1, random_state=123)
for train_i,test_i in skf.split(X,Y):
    x_train,y_train = X[train_i], Y[train_i]
    x_test,y_test = X[test_i],Y[test_i]

In [126]:
lr = LogisticRegression().fit(x_train, y_train)
pred = lr.predict(x_test)

In [127]:
recall_score(pred,y_test,average=None)

array([ 0.83263598,  0.        ])

In [136]:
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.5)
cross_val_score(gbm,x_train,y_train,scoring='recall',cv=2)

array([ 0.10734463,  0.14124294])

In [138]:
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0])

In [137]:
pred = gbm.fit(x_train,y_train).predict(x_test)
recall_score(pred,y_test)

0.25

# useless tfidf?

In [167]:
def simpleTfidf(df,target='python'):
    """
    reduce dimension for model traning
    """
    df.content = df.content.apply(cleaner)
    tfidf = TfidfVectorizer(min_df=0.001,max_df=0.90, max_features=None, tokenizer= lambda x: x.split(' '), ngram_range=(1,1))
    tfidf_trained = tfidf.fit_transform(list(df.content))
    df_tfidf = pd.DataFrame({'token':tfidf.get_feature_names(),'tfidf_value':tfidf.idf_})
    return df_tfidf