In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
labeled = pd.read_excel('manual_labels.xlsx')
unlabeled = pd.read_excel('manual_unlabeled.xlsx')

In [4]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
def clean_comment(text):
    wnl = WordNetLemmatizer()
    deacc = re.sub(r'\W',' ', text)
    tokens = word_tokenize(deacc)
    res = ''
    for t in tokens:
        res += wnl.lemmatize(t)+' '
    return res
def get_tokens(text):
    wnl = WordNetLemmatizer()
    deacc = re.sub(r'\W',' ', text)
    tokens = word_tokenize(deacc)
    return tokens

In [5]:
labeled['tokens'] = labeled.Review.apply(get_tokens)

In [10]:
labeled_long = labeled[labeled.tokens.apply(len)>6]

In [11]:
labeled_long.head()

Unnamed: 0.1,Unnamed: 0,Rating,Review,lang,label,tokens
18260,18260,5,Best ive played since a kid when it first came...,en,4,"[Best, ive, played, since, a, kid, when, it, f..."
13087,13087,1,I can't download The New update,en,5,"[I, can, t, download, The, New, update]"
42370,42370,5,If this game wasn't made my life would be took...,en,4,"[If, this, game, wasn, t, made, my, life, woul..."
12018,12018,1,THIS SUCKS IT GAVE MME BACK ONLY MY ARMOR FOR ...,en,3,"[THIS, SUCKS, IT, GAVE, MME, BACK, ONLY, MY, A..."
14980,14980,5,I got every thing in the game yay,en,4,"[I, got, every, thing, in, the, game, yay]"


In [12]:
labeled_long.label.value_counts()

4    255
2    175
5     65
3     28
0     18
1      8
Name: label, dtype: int64

In [13]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LassoLarsCV
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [100]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vect = CountVectorizer(ngram_range = (1,1), analyzer = 'word',
                       stop_words = 'english',
                       #max_features = 500,
                       min_df = 2, max_df = 0.95).fit(unlabeled.Review)
feats = vect.transform(labeled_long.Review).toarray()
labels = labeled_long.label.as_matrix()

In [101]:
feats.shape

(549, 3168)

In [102]:
X_train, X_test, y_train, y_test = train_test_split(feats, labels, test_size=0.2)

In [103]:
def eval_model(y_train,y_test,y_train_pred,y_test_pred):
    
    class_names = ['unknown',
        'Crash',
        'Balance problems',
        'Synchronization',
        'Positive',
        'Bug']
    
    class_names_b = ['neg', 'pos']
    print('train scores\n')
    print(classification_report(y_train, y_train_pred, target_names = class_names))
    print('test scores\n')
    print(classification_report(y_test, y_test_pred, target_names = class_names))

In [104]:
m = OneVsRestClassifier(DummyClassifier()).fit(X_train, y_train)
y_train_pred = m.predict(X_train)
y_test_pred = m.predict(X_test)
eval_model(y_train,y_test,y_train_pred,y_test_pred)

train scores

                  precision    recall  f1-score   support

         unknown       0.00      0.00      0.00        13
           Crash       0.00      0.00      0.00         7
Balance problems       0.28      0.10      0.15       118
 Synchronization       0.00      0.00      0.00        19
        Positive       0.48      0.43      0.45       180
             Bug       0.13      0.49      0.21        47

     avg / total       0.33      0.29      0.28       384

test scores

                  precision    recall  f1-score   support

         unknown       0.00      0.00      0.00         5
           Crash       0.00      0.00      0.00         1
Balance problems       0.33      0.14      0.20        57
 Synchronization       0.00      0.00      0.00         9
        Positive       0.44      0.45      0.45        75
             Bug       0.11      0.33      0.16        18

     avg / total       0.33      0.29      0.29       165



  'precision', 'predicted', average, warn_for)


In [128]:
m = LogisticRegression(class_weight = 'balanced', C = .5).fit(X_train, y_train)
y_train_pred = m.predict(X_train)
y_test_pred = m.predict(X_test)
eval_model(y_train,y_test,y_train_pred,y_test_pred)

train scores

                  precision    recall  f1-score   support

         unknown       0.93      1.00      0.96        13
           Crash       1.00      1.00      1.00         7
Balance problems       0.96      0.88      0.92       118
 Synchronization       1.00      1.00      1.00        19
        Positive       0.93      0.97      0.95       180
             Bug       0.91      0.91      0.91        47

     avg / total       0.94      0.94      0.94       384

test scores

                  precision    recall  f1-score   support

         unknown       0.00      0.00      0.00         5
           Crash       0.00      0.00      0.00         1
Balance problems       0.74      0.60      0.66        57
 Synchronization       0.67      0.22      0.33         9
        Positive       0.70      0.92      0.79        75
             Bug       0.64      0.39      0.48        18

     avg / total       0.68      0.68      0.66       165



In [129]:
val_en = pd.read_excel('validation_en.xlsx')

In [130]:
def eval_classifier(input_text,model = m):
    feats = vect.transform([input_text])
    class_names = ['unknown',
        'Crash',
        'Balance problems',
        'Synchronization',
        'Positive',
        'Bug']
    class_names_b = ['neg', 'pos']
    prediction = model.predict(feats.toarray())
    #print(class_names[prediction[0]])
    return class_names[prediction[0]]

In [134]:
val_en.Crash.apply(eval_classifier)

0            Positive
1                 Bug
2                 Bug
3    Balance problems
4               Crash
5               Crash
6               Crash
7                 Bug
8               Crash
9            Positive
Name: Crash, dtype: object