In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
labeled = pd.read_excel('manual_labels.xlsx')
unlabeled = pd.read_excel('manual_unlabeled.xlsx')

In [3]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
def clean_comment(text):
    wnl = WordNetLemmatizer()
    deacc = re.sub(r'\W',' ', text)
    tokens = word_tokenize(deacc)
    res = ''
    for t in tokens:
        res += wnl.lemmatize(t)+' '
    return res

In [4]:
labeled['label_b'] = labeled.label.apply(lambda x: 1 if x==4 else 0)

In [5]:
labeled['cleaned'] = labeled.Review.apply(clean_comment)

In [6]:
labeled.cleaned.head()

57204                                    Thismgame is fun 
18260    Best ive played since a kid when it first came...
52241                           I think this app is great 
13087                     I can t download The New update 
42370    If this game wasn t made my life would be took...
Name: cleaned, dtype: object

In [7]:
unlabeled['label_b'] = -1

In [8]:
total = pd.concat([labeled[['Review', 'label_b']], unlabeled[['Review','label_b']]], axis = 0)

In [76]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LassoLarsCV
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [195]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vect = CountVectorizer(ngram_range = (1,3), analyzer = 'word',
                       stop_words = 'english',
                       max_features = 500,
                       min_df = 2, max_df = 0.95).fit(labeled.Review)
feats = vect.transform(labeled.Review).toarray()
labels = labeled.label_b.as_matrix()

In [196]:
feats.shape

(1000, 500)

In [197]:
X_train, X_test, y_train, y_test = train_test_split(feats, labels, test_size=0.2)

In [198]:
def eval_model(y_train,y_test,y_train_pred,y_test_pred):
    
    class_names = ['unknown',
        'Crash',
        'Balance problems',
        'Synchronization',
        'Positive',
        'Bug']
    
    class_names_b = ['neg', 'pos']
    print('train scores\n')
    print(classification_report(y_train, y_train_pred, target_names = class_names_b))
    print('test scores\n')
    print(classification_report(y_test, y_test_pred, target_names = class_names_b))

In [199]:
m = OneVsRestClassifier(DummyClassifier()).fit(X_train, y_train)
y_train_pred = m.predict(X_train)
y_test_pred = m.predict(X_test)
eval_model(y_train,y_test,y_train_pred,y_test_pred)

train scores

             precision    recall  f1-score   support

        neg       0.37      0.37      0.37       303
        pos       0.61      0.61      0.61       497

avg / total       0.52      0.52      0.52       800

test scores

             precision    recall  f1-score   support

        neg       0.44      0.44      0.44        73
        pos       0.68      0.69      0.68       127

avg / total       0.59      0.59      0.59       200



In [200]:
m = MultinomialNB().fit(X_train, y_train)
y_train_pred = m.predict(X_train)
y_test_pred = m.predict(X_test)
eval_model(y_train,y_test,y_train_pred,y_test_pred)

train scores

             precision    recall  f1-score   support

        neg       0.95      0.77      0.85       303
        pos       0.87      0.98      0.92       497

avg / total       0.90      0.90      0.89       800

test scores

             precision    recall  f1-score   support

        neg       0.90      0.62      0.73        73
        pos       0.81      0.96      0.88       127

avg / total       0.84      0.83      0.83       200



In [71]:
m = KNeighborsClassifier(n_neighbors = 10).fit(X_train, y_train)
y_train_pred = m.predict(X_train)
y_test_pred = m.predict(X_test)
eval_model(y_train,y_test,y_train_pred,y_test_pred)

train scores

             precision    recall  f1-score   support

        neg       0.88      0.49      0.63       293
        pos       0.77      0.96      0.85       507

avg / total       0.81      0.79      0.77       800

test scores

             precision    recall  f1-score   support

        neg       0.85      0.42      0.56        83
        pos       0.70      0.95      0.80       117

avg / total       0.76      0.73      0.70       200



In [19]:
val_en = pd.read_excel('validation_en.xlsx')

In [21]:
def eval_classifier(input_text,model = m):
    feats = vect.transform([input_text])
    class_names = ['unknown',
        'Crash',
        'Balance problems',
        'Synchronization',
        'Positive',
        'Bug']
    class_names_b = ['neg', 'pos']
    prediction = model.predict(feats.toarray())
    #print(class_names[prediction[0]])
    return class_names_b[prediction[0]]

In [26]:
val_en.Bug.apply(eval_classifier)

0    neg
1    pos
2    neg
3    neg
4    neg
5    neg
6    neg
7    neg
8    pos
9    neg
Name: Bug, dtype: object

In [11]:
from sklearn.semi_supervised import label_propagation
from scipy.sparse import csgraph
lp_model = label_propagation.LabelSpreading(kernel = 'knn',gamma = 1000, n_neighbors = 10,alpha = 0.25, max_iter=15, n_jobs = -1)
lp_model.fit(feats, labels)

  self.label_distributions_ /= normalizer


LabelSpreading(alpha=0.25, gamma=1000, kernel='knn', max_iter=15, n_jobs=-1,
        n_neighbors=10, tol=0.001)

In [12]:
lp_model.n_iter_

8

In [13]:
pd.Series(lp_model.transduction_).value_counts()

1    9602
0    4198
dtype: int64

In [14]:
m = lp_model

In [15]:
labels_p = lp_model.transduction_

In [16]:
total['label_p'] = labels_p

In [17]:
total.label_p.value_counts()

1    9602
0    4198
Name: label_p, dtype: int64

In [22]:
val_en.Balance.apply(eval_classifier)

0    neg
1    pos
2    pos
3    pos
4    neg
5    pos
6    neg
7    neg
8    neg
9    pos
Name: Balance, dtype: object

In [105]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vect = CountVectorizer(ngram_range = (1,2), analyzer = 'word',
                       stop_words = 'english',
                       strip_accents = 'unicode',
                       min_df = 2, max_df = 0.95).fit(total.Review)
feats = vect.transform(total.Review).toarray()
labels = total.label_p.as_matrix()

In [106]:
feats.shape

(13800, 9994)

In [107]:
X_train, X_test, y_train, y_test = train_test_split(feats, labels, test_size=0.2)

In [176]:
m = MLPClassifier(activation = 'relu',
                  hidden_layer_sizes = [50,50,25],
                  early_stopping = True, validation_fraction = 0.2,
                  verbose = 1).fit(X_train, y_train)
y_train_pred = m.predict(X_train)
y_test_pred = m.predict(X_test)
eval_model(y_train,y_test,y_train_pred,y_test_pred)

Iteration 1, loss = 0.64756346
Validation score: 0.706975
Iteration 2, loss = 0.44269012
Validation score: 0.825181
Iteration 3, loss = 0.25299290
Validation score: 0.835598
Iteration 4, loss = 0.14227583
Validation score: 0.836504
Iteration 5, loss = 0.08732756
Validation score: 0.831069
Iteration 6, loss = 0.05936568
Validation score: 0.830616
Iteration 7, loss = 0.04568356
Validation score: 0.826540
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
train scores

             precision    recall  f1-score   support

        neg       0.90      0.93      0.91      3365
        pos       0.97      0.96      0.96      7675

avg / total       0.95      0.95      0.95     11040

test scores

             precision    recall  f1-score   support

        neg       0.69      0.72      0.70       833
        pos       0.88      0.86      0.87      1927

avg / total       0.82      0.82      0.82      2760



In [190]:
m = MultinomialNB().fit(X_train, y_train)
y_train_pred = m.predict(X_train)
y_test_pred = m.predict(X_test)
eval_model(y_train,y_test,y_train_pred,y_test_pred)

train scores

             precision    recall  f1-score   support

        neg       0.80      0.56      0.66      3365
        pos       0.83      0.94      0.88      7675

avg / total       0.82      0.82      0.81     11040

test scores

             precision    recall  f1-score   support

        neg       0.61      0.47      0.53       833
        pos       0.79      0.87      0.83      1927

avg / total       0.73      0.75      0.74      2760



In [202]:
def eval_classifier(input_text,model = m):
    feats = vect.transform([input_text])
    class_names = ['unknown',
        'Crash',
        'Balance problems',
        'Synchronization',
        'Positive',
        'Bug']
    class_names_b = ['neg', 'pos', 'uncertain']
    prediction = model.predict(feats.toarray())
    """
    if prediction_prob[0][1]>0.9:
        prediction = 1
    elif prediction_prob[0][0]>0.6:
        prediction = 0
    else:
        prediction = 2
    """
    #print(class_names[prediction[0]])
    return prediction[0]

In [215]:
val_en.Bug.apply(eval_classifier)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    1
9    0
Name: Bug, dtype: int64

In [225]:
unlabeled['label_p'] = unlabeled.Review.apply(eval_classifier)

In [226]:
unlabeled_neg = unlabeled[unlabeled['label_p']<1]

In [230]:
unlabeled_neg.loc[:,'label'] = -1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [231]:
labeled_neg = labeled[labeled.label!=4]

In [232]:
total = pd.concat([labeled_neg[['Review', 'label']], unlabeled_neg[['Review','label']]], axis = 0)

In [234]:
total.label.value_counts()

-1    3875
 2     203
 5      74
 0      60
 3      31
 1       8
Name: label, dtype: int64

In [239]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vect = CountVectorizer(ngram_range = (1,3), analyzer = 'word',
                       stop_words = 'english',
                       max_features = 500,
                       min_df = 2, max_df = 0.95).fit(total.Review)
feats = vect.transform(total.Review).toarray()
labels = total.label.as_matrix()

In [249]:
from sklearn.semi_supervised import label_propagation
from scipy.sparse import csgraph
lp_model = label_propagation.LabelSpreading(kernel = 'knn',gamma = 1000, n_neighbors = 15,alpha = 0.1, max_iter=15, n_jobs = -1)
lp_model.fit(feats, labels)

  self.label_distributions_ /= normalizer


LabelSpreading(alpha=0.1, gamma=1000, kernel='knn', max_iter=15, n_jobs=-1,
        n_neighbors=15, tol=0.001)

In [250]:
lp_model.n_iter_

6

In [251]:
pd.Series(lp_model.transduction_).value_counts()

0    2067
2    1480
5     489
3     136
1      79
dtype: int64

In [252]:
labels_p = lp_model.transduction_

In [253]:
total['label_p'] = labels_p

In [264]:
total['label_p'] = total.label_p.apply(lambda x: 4 if x == 5 else x)

In [265]:
total.label_p.value_counts()

0    2067
2    1480
4     489
3     136
1      79
Name: label_p, dtype: int64

In [341]:
total['cleaned'] = total.Review.apply(clean_comment)

In [409]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vect = CountVectorizer(ngram_range = (1,5), analyzer = 'char_wb',
                       stop_words = 'english',
                       min_df = 2, max_df = 0.95).fit(total.Review)
feats = vect.transform(total.Review).toarray()
labels = total.label_p.as_matrix()

In [410]:
feats.shape

(4251, 24439)

In [411]:
X_train, X_test, y_train, y_test = train_test_split(feats, labels, test_size=0.2)

In [412]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LassoLarsCV
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [413]:
def eval_model(y_train,y_test,y_train_pred,y_test_pred):
    
    class_names = ['unknown',
        'Crash',
        'Balance problems',
        'Synchronization',
        #'Positive',
        'Bug']
    
    class_names_b = ['neg', 'pos']
    print('train scores\n')
    print(classification_report(y_train, y_train_pred, target_names = class_names))
    print('test scores\n')
    print(classification_report(y_test, y_test_pred, target_names = class_names))

In [415]:
m = LogisticRegression(C = 0.01,
                 class_weight = 'balanced').fit(X_train, y_train)
y_train_pred = m.predict(X_train)
y_test_pred = m.predict(X_test)
eval_model(y_train,y_test,y_train_pred,y_test_pred)

train scores

                  precision    recall  f1-score   support

         unknown       0.87      0.79      0.83      1657
           Crash       0.62      1.00      0.76        69
Balance problems       0.80      0.79      0.79      1184
 Synchronization       0.68      0.90      0.77       116
             Bug       0.65      0.81      0.72       374

     avg / total       0.81      0.80      0.80      3400

test scores

                  precision    recall  f1-score   support

         unknown       0.74      0.71      0.73       410
           Crash       0.14      0.30      0.19        10
Balance problems       0.71      0.68      0.69       296
 Synchronization       0.12      0.20      0.15        20
             Bug       0.54      0.56      0.55       115

     avg / total       0.68      0.66      0.67       851



In [416]:
def eval_classifier(input_text,model = m):
    feats = vect.transform([input_text])
    class_names = ['unknown',
        'Crash',
        'Balance problems',
        'Synchronization',
        #'Positive',
        'Bug']
    class_names_b = ['neg', 'pos', 'uncertain']
    prediction = model.predict(feats.toarray())
    """
    if prediction_prob[0][1]>0.9:
        prediction = 1
    elif prediction_prob[0][0]>0.6:
        prediction = 0
    else:
        prediction = 2
    """
    #print(class_names[prediction[0]])
    return class_names[prediction[0]]

In [417]:
val_en.Bug.apply(eval_classifier)

0    Balance problems
1    Balance problems
2               Crash
3                 Bug
4                 Bug
5                 Bug
6    Balance problems
7    Balance problems
8             unknown
9             unknown
Name: Bug, dtype: object

In [420]:
total.to_csv('label_prop_neg.csv')

In [419]:
total.label_p.value_counts()

0    2067
2    1480
4     489
3     136
1      79
Name: label_p, dtype: int64