## Working in Python 2

In [40]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation warnings raised by the libraries used in this notebook.
warnings.filterwarnings('ignore')
from tqdm import tqdm

## Building on Yang's Work

In [2]:
# Directory containing the input files (df_java.pkl is read from here),
# given relative to the notebook's working directory.
DATA_PATH = '../data'

In [3]:
import pickle

In [4]:
# Load the pickled DataFrame of questions (columns: `content`, `is_java`).
# SECURITY: pickle.load can execute arbitrary code while deserialising, so only
# load a df_java.pkl that comes from a trusted source.
# No `df = None` pre-assignment: if the load fails, later cells should stop on a
# clear NameError rather than continue with df silently set to None.
with open(DATA_PATH+'/df_java.pkl', 'rb') as f:
    df = pickle.load(f)


In [5]:
df.head()

Unnamed: 0,content,is_java
0,"""Click"" event not getting triggered due to ""bl...",0
1,"""Command ""python setup.py egg_info"" failed wit...",0
2,"""End-of-central-directory signature not found""...",0
3,"""Initialization-on-demand holder idiom"" - Lazy...",1
4,"""ValueError: I/O operation on closed file"" whe...",0


## Normalization code from *Text Analytics with Python*


In [6]:
# https://github.com/dipanjanS/text-analytics-with-python/blob/master/Chapter-4/normalization.py

import normalization

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

## The cells below normalize the text as follows:
- Expand contractions
- Lemmatize the text based on parts of speech
- Remove stopwords
- Skip the removal of special characters, because we are dealing with Unicode characters
- Tokenize the text

In [8]:
# Materialise the `content` column as a plain Python list of documents.
corpus = list(df.content)


In [9]:
# Pass every document through normalize_corpus from the local `normalization`
# module (source linked in the import cell).
# NOTE(review): `corpus` is rebound to the normalized text, so re-running this
# cell on its own feeds already-normalized documents back into the normalizer;
# re-run the cell that builds `corpus` first.
corpus = normalization.normalize_corpus(corpus)

In [10]:
# example
corpus[42]

u'access kubernetes service account via google cloud api look like kubernetes manage multiple service account cluster https kubernetesiodocsuserguideserviceaccountshowever find good tutorial documentation access nondefault service account via google cloud platform python api https githubcomgooglecloudplatformgooglecloudpython documentation see access default application credential default service account google compute engine https cloudgooglecomcomputedocsaccesscreateenableserviceaccountsforinstances clientlib question access nondefault credential nondefault service account'

In [33]:
len(corpus),len(df)

(2382, 2382)

In [32]:
from sklearn.model_selection import train_test_split

In [34]:
# Hold out 30% of the documents for evaluation; the fixed random_state keeps
# the split identical across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    corpus,
    df.is_java,
    test_size=0.30,
    random_state=123,
)

In [35]:
len(X_train), len(X_test), len(Y_train), len(Y_test)

(1667, 715, 1667, 715)

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
tfv = TfidfVectorizer(min_df=3, strip_accents='unicode', analyzer='word',\
token_pattern=r'\w{1,}', ngram_range=(1,2), use_idf = 1, smooth_idf = 1, sublinear_tf = 1, stop_words='english')

text_train = tfv.fit_transform(X_train)
text_test = tfv.fit_transform(X_test)

In [37]:
text_train.shape

(1667, 4495)

## Reduce dimensions via SVD

In [52]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, random_state=0)
svd_tfidf = svd.fit_transform(text_train)

In [53]:
svd_tfidf.shape

(1667, 50)

## Naive Bayes

In [42]:
from sklearn.naive_bayes import GaussianNB


In [54]:
gnb = GaussianNB()
gnb.fit(svd_tfidf, Y_train)

GaussianNB(priors=None)

In [59]:
Y_pred = gnb.predict(svd.fit_transform(text_test))

In [60]:
len(Y_pred)

715

In [63]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [64]:
# Recall on the held-out set: the share of questions truly labelled
# is_java == 1 that the classifier also predicted as 1 (recall_score treats
# label 1 as the positive class by default).
print("Recall: ")
print(recall_score(Y_test, Y_pred))

Recall: 
0.046511627907


In [65]:
# Precision on the held-out set: the share of questions predicted as
# is_java == 1 that are truly labelled 1 (precision_score treats label 1 as
# the positive class by default).
print("Precision: ")
print(precision_score(Y_test, Y_pred))

Precision: 
0.105263157895
