In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
comments = pd.read_csv('comments_en_cleaned.csv')
comments_en = comments[comments.lang == 'en']
vect = CountVectorizer(ngram_range = (1,1), analyzer = 'word',
                       stop_words = 'english',
                       max_features = 500,
                       min_df = 2, max_df = 0.95).fit(comments_en.Review)
pw = list(vect.vocabulary_.keys())

In [3]:
import enchant
c = enchant.Dict("en_UK")
def check_spelling(text):
    if not c.check(text):
        suggestions = list(set(c.suggest(text)).intersection(set(pw)))
        if len(suggestions)>0:
            res = suggestions[0]
        else:
            res = text
    else:
        res = text
    return res

In [4]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import nltk
from nltk.corpus import wordnet
from emoji.unicode_codes import UNICODE_EMOJI
import emoji
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

def clean_comment(text):
    wnl = WordNetLemmatizer()
    deacc = re.sub(r'\!',' exclamation_point ', text)
    tokens = word_tokenize(deacc)
    tags = nltk.pos_tag(tokens)
    processed = []
    for (word, tag) in tags:
        wn_tag = get_wordnet_pos(tag)
        if wn_tag!='':
            processed.append(wnl.lemmatize(word,wn_tag))
        else:
            processed.append(wnl.lemmatize(check_spelling(word)))
    res = ' '.join(processed)
    return res

In [97]:
classes_nums = {
    'Balance':1,
    'Graphics':2,
    'Bug':3,
    'Advertising':4,
    'Monetization':5,
    'Other':0
}
labeled4 = pd.read_excel('temp data/for_labeling 4.xlsx').loc[:,['Review', 'Label']]
labeled1 = pd.read_excel('temp data/for_labeling 1.xlsx').loc[:,['Review', 'Label']]
labeled2 = pd.read_excel('temp data/for_labeling 2.xlsx').loc[:,['Review', 'Label']]
labeled2 = labeled2[(labeled2.Label!='?')&(labeled2.Label!='-')]
labeled1['label_num'] = labeled1.Label.map(classes_nums)
labeled4['label_num'] = labeled4.Label.map(classes_nums)
labeled2['label_num'] = labeled2.Label

In [98]:
labeled = pd.concat([labeled4, labeled2, labeled1], axis = 0)
labeled = labeled.dropna(axis = 0)
labeled.label_num = labeled.label_num.apply(int)

In [99]:
def cut_major_class(df, frac = 0.5):
    major_class = df.Label.value_counts().index[0]
    dfmc = df[df.Label==major_class].sample(frac = frac)
    df_rest = df[df.Label!=major_class]
    return pd.concat([dfmc, df_rest],axis = 0)

In [100]:
#labeled = cut_major_class(labeled, frac = 0.25)

In [101]:
labeled.shape

(3104, 3)

In [102]:
labeled.loc[:,'cleaned'] = labeled.Review.apply(clean_comment)

In [104]:
check_spelling('awsum')

'awesome'

In [105]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [191]:
vect = CountVectorizer(ngram_range = (1,1), analyzer = 'word',
                       stop_words = 'english',
                       #max_features = 10000,
                       min_df = 2, max_df = 0.95).fit(comments_en.cleaned)
vocab = vect.vocabulary_

In [192]:
def eval_model(y_train,y_test,y_train_pred,y_test_pred):
    
    class_names = ['Other',
        'Balance',
        'Graphics',
        'Bug',
        'Advertising',
        'Monetization']
    
    class_names_b = ['neg', 'pos']
    print('train scores\n')
    print(classification_report(y_train, y_train_pred, target_names = class_names))
    print('test scores\n')
    print(classification_report(y_test, y_test_pred, target_names = class_names))

In [193]:
from sklearn.pipeline import Pipeline
vect = CountVectorizer()
model = MultinomialNB()
lin_model = Pipeline([('vectorizer', vect), ('classifier', model)])
lin_model.set_params(vectorizer__ngram_range = (1,3),vectorizer__analyzer = 'word', vectorizer__stop_words = 'english',
                     vectorizer__max_features = 500,
                     vectorizer__min_df = 2, vectorizer__max_df = 0.95,
                     vectorizer__vocabulary = vocab,
                     
                     classifier__class_weight = 'balanced', classifier__C = 0.01)
feats = labeled.cleaned
labels = labeled.label_num
X_train, X_test, y_train, y_test = train_test_split(feats, labels, test_size=0.2)
lin_model = lin_model.fit(X_train, y_train)
y_train_pred = lin_model.predict(X_train)
y_test_pred = lin_model.predict(X_test)
eval_model(y_train,y_test,y_train_pred,y_test_pred)
print('cross_val_score: %1.4f'% (np.mean(cross_val_score(lin_model, labeled.cleaned,labeled.label_num))))

train scores

              precision    recall  f1-score   support

       Other       0.86      0.98      0.92      2002
     Balance       0.72      0.21      0.33       107
    Graphics       0.72      0.70      0.71        44
         Bug       0.82      0.27      0.41       195
 Advertising       0.67      0.67      0.67         3
Monetization       0.68      0.34      0.45       132

 avg / total       0.84      0.85      0.82      2483

test scores

              precision    recall  f1-score   support

       Other       0.85      0.99      0.91       499
     Balance       0.50      0.07      0.12        28
    Graphics       0.60      0.50      0.55         6
         Bug       0.75      0.15      0.25        61
 Advertising       0.00      0.00      0.00         2
Monetization       0.56      0.40      0.47        25

 avg / total       0.81      0.83      0.79       621



  'precision', 'predicted', average, warn_for)


cross_val_score: 0.8273


In [176]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
vect = CountVectorizer()
model = RidgeClassifier()
lin_model = Pipeline([('vectorizer', vect), ('classifier', model)])
lin_model.set_params(vectorizer__analyzer = 'word', vectorizer__stop_words = 'english',
                     vectorizer__min_df = 2, vectorizer__max_df = 0.95,
                     
                     classifier__class_weight = 'balanced')

param_grid = {
        'vectorizer__ngram_range': [(1,1), (1,2), (1,3)],
        'vectorizer__max_features': [200,250,300,350,400],        
        'classifier__alpha':[0.1, 0.2, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]
    }

gs = GridSearchCV(lin_model, cv=3, n_jobs=-1, param_grid=param_grid, verbose = 1)
feats = labeled.cleaned
labels = labeled.label_num
X_train, X_test, y_train, y_test = train_test_split(feats, labels, test_size=0.2)
gs = gs.fit(feats, labels)

Fitting 3 folds for each of 135 candidates, totalling 405 fits




KeyboardInterrupt: 

In [332]:
gs.best_score_

0.77054631828978626

In [194]:
from sklearn.metrics import f1_score,roc_auc_score,accuracy_score
def eval_classifier(input_text,model):
    cleaned_text = clean_comment(input_text)
    feats = vect.transform([cleaned_text])
    class_names = ['Other',
        'Balance',
        'Graphics',
        'Bug',
        'Advertising',
        'Monetization']
    prediction = model.predict(feats.toarray())
    #print(class_names[prediction[0]])
    return prediction[0]
def eval_pipeline(input_text, model = lin_model):
    cleaned_text = clean_comment(input_text)
    class_names = ['Other',
        'Balance',
        'Graphics',
        'Bug',
        'Advertising',
        'Monetization']
    prediction = model.predict([cleaned_text])
    #print(class_names[prediction[0]])
    return prediction[0]
def val_score(model):
    class_names = ['Other',
        'Balance',
        'Graphics',
        'Bug',
        'Advertising',
        'Monetization']   
    val_en = pd.read_excel('temp data/val google play.xlsx')
    classes_nums = {
        'Balance':1,
        'Graphics':2,
        'Bug':3,
        'Advertising':4,
        'Monetization':5,
        'Other':0
    }    
    val_en['label_num'] = val_en.Label.map(classes_nums)
    y_true = val_en.label_num
    y_pred = val_en.Review.apply(eval_pipeline)
    print(classification_report(y_true, y_pred, target_names = class_names))
    print('model accuracy %1.4f'%(accuracy_score(y_true, y_pred)))
    return y_true,y_pred
y_true,y_pred = val_score(lin_model)

              precision    recall  f1-score   support

       Other       0.35      1.00      0.52        11
     Balance       0.50      0.33      0.40         6
    Graphics       0.00      0.00      0.00         1
         Bug       1.00      0.33      0.50        27
 Advertising       0.50      0.60      0.55         5

 avg / total       0.73      0.50      0.49        50

model accuracy 0.5000


  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)


In [180]:
eval_pipeline('It\'s good but it crashes way to much and it\'s so pay to win I could make a whole day ranting about how unfair it is and maybe your game might become better if you listen what other people want like no armor sever and more this game could be better if you listened to what other people want like less overpowered weapons and less overpriced things')

1

In [181]:
eval_pipeline('it keeps putting me against 20-25 lvl players when i am just 12')

0

In [182]:
eval_pipeline('graphics are strong in this one')

2

In [183]:
eval_pipeline('the game crashes like every 5 minutes')

0

In [184]:
eval_pipeline('Way to many ads. Thats so annoying')

4

In [185]:
eval_pipeline('weapons prices are just crazy! pg3d r u nuts?')

0

In [186]:
eval_pipeline('The Emperor protects')

0