In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Load the review dump and keep only the rows tagged as English.
comments = pd.read_csv('comments_en_cleaned.csv')
# .copy() gives comments_en its own data: new columns are added to this frame
# further down the notebook, and assigning into a boolean-mask slice triggers
# pandas' SettingWithCopyWarning.
comments_en = comments[comments.lang == 'en'].copy()
# Unigram bag-of-words over the raw reviews, capped at the 500 most frequent
# terms that appear in at least 2 documents and at most 95% of them.
vect = CountVectorizer(ngram_range = (1,1), analyzer = 'word',
                       stop_words = 'english',
                       max_features = 500,
                       min_df = 2, max_df = 0.95).fit(comments_en.Review)
# pw: the resulting vocabulary as a plain list; check_spelling() prefers
# spelling suggestions that occur in it.
pw = list(vect.vocabulary_.keys())

In [3]:
# Dump the vocabulary to pw.txt, one term per line.
# The encoding is pinned so that non-ASCII terms are written the same way on
# every platform instead of depending on the locale's default codec.
with open("pw.txt", "w", encoding="utf-8") as text_file:
    text_file.writelines("%s\n" % w for w in pw)

In [4]:
import enchant
# Shared spell-checking dictionary used by check_spelling().
# NOTE(review): "en_UK" is not the usual tag for British English ("en_GB");
# confirm which dictionary enchant actually resolves for this tag.
c = enchant.Dict("en_UK")
def check_spelling(text):
    """Return `text` if it is spelled correctly, otherwise the best replacement.

    Uses the notebook-level enchant dictionary `c` and the corpus vocabulary
    `pw`. For a misspelled token the dictionary's suggestions are scanned in
    the order enchant returns them; the first one that also occurs in `pw`
    wins, falling back to the first suggestion overall, and finally to the
    token itself when there are no suggestions.

    Parameters
    ----------
    text : str
        A single token.

    Returns
    -------
    str
        The original token or its chosen correction.

    NOTE(review): tokens that are not words are still sent to the dictionary;
    the demo cell below shows '42' being "corrected" to 'W'.
    """
    # Nothing to check for an empty token; hand it back untouched.
    if not text:
        return text
    if c.check(text):
        return text
    # Query the dictionary once and reuse the list for both lookups.
    candidates = c.suggest(text)
    if not candidates:
        return text
    vocabulary = set(pw)
    # Walk the suggestions in enchant's own ranking so the choice is
    # reproducible (picking from a set intersection is order-dependent).
    for candidate in candidates:
        if candidate in vocabulary:
            return candidate
    return candidates[0]

In [5]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize,TreebankWordTokenizer
import re
import nltk
from nltk.corpus import wordnet
from emoji.unicode_codes import UNICODE_EMOJI
import emoji
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the corresponding WordNet POS constant.

    Only the first letter of the tag matters: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag yields the empty string.
    """
    by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return by_initial.get(treebank_tag[:1], '')

def clean_comment(text):
    """Normalise a raw review: tokenize, spell-correct and lemmatize each token.

    Tokens are POS-tagged so the lemmatizer can be given a WordNet part of
    speech when one is available; otherwise the lemmatizer's default is used.
    Returns the processed tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    splitter = TreebankWordTokenizer()
    tagged_tokens = nltk.pos_tag(splitter.tokenize(text))

    def _normalise(token, treebank_tag):
        # Spell-correct first, then lemmatize with the POS hint if there is one.
        pos = get_wordnet_pos(treebank_tag)
        corrected = check_spelling(token)
        if pos != '':
            return lemmatizer.lemmatize(corrected, pos)
        return lemmatizer.lemmatize(corrected)

    return ' '.join(_normalise(token, tag) for (token, tag) in tagged_tokens)

In [6]:
clean_comment('tihs game is $42 assome')

'this game be $ W awesome'

In [7]:
# Numeric id for every review category name.
classes_nums = {
    'Balance':1,
    'Graphics':2,
    'Bug':3,
    'Advertising':4,
    'Monetization':5,
    'Other':0
}
# Only the review text and its label are needed from each labelling workbook.
labeled4 = pd.read_excel('temp data/for_labeling 4.xlsx').loc[:,['Review', 'Label']]
labeled1 = pd.read_excel('temp data/for_labeling 1.xlsx').loc[:,['Review', 'Label']]
labeled2 = pd.read_excel('temp data/for_labeling 2.xlsx').loc[:,['Review', 'Label']]
# Drop rows labelled '?' or '-' (presumably placeholders for undecided rows).
# .copy() so the column added below lands on an independent frame rather than
# on a filtered slice (avoids pandas' SettingWithCopyWarning).
labeled2 = labeled2[(labeled2.Label!='?')&(labeled2.Label!='-')].copy()
# Batches 1 and 4 carry category names, so they are mapped to numeric ids.
labeled1['label_num'] = labeled1.Label.map(classes_nums)
labeled4['label_num'] = labeled4.Label.map(classes_nums)
# Batch 2's Label column is taken as the numeric id directly (no mapping).
labeled2['label_num'] = labeled2.Label

In [8]:
# Stack the three labelled batches, discard rows with any missing value and
# make sure the numeric label is a plain int.
labeled = pd.concat([labeled4, labeled2, labeled1], axis = 0).dropna(axis = 0)
labeled['label_num'] = labeled['label_num'].apply(int)

In [9]:
def cut_major_class(df, frac = 0.5, label_col = 'Label', random_state = None):
    """Undersample the most frequent class of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by `label_col`.
    frac : float, default 0.5
        Fraction of the majority-class rows to keep.
    label_col : str, default 'Label'
        Column that holds the class. The notebook's combined frame stores
        category names for some batches and numeric ids for others in
        `Label`, so pass 'label_num' there to treat them as one class.
    random_state : int or None, default None
        Seed forwarded to `DataFrame.sample` for a reproducible subsample.

    Returns
    -------
    pandas.DataFrame
        The sampled majority-class rows followed by all remaining rows.
        An empty input yields an empty copy.
    """
    # value_counts() on an empty frame has no first entry to pick.
    if df.empty:
        return df.copy()
    major_class = df[label_col].value_counts().index[0]
    is_major = df[label_col] == major_class
    dfmc = df[is_major].sample(frac = frac, random_state = random_state)
    df_rest = df[~is_major]
    return pd.concat([dfmc, df_rest], axis = 0)

In [10]:
#labeled = cut_major_class(labeled, frac = 0.5)

In [11]:
labeled.shape

(4265, 3)

In [12]:
#comments_en['cleaned'] = comments_en['Review'].apply(clean_comment)

In [13]:
labeled.loc[:,'cleaned'] = labeled.Review.apply(clean_comment)

In [14]:
labeled.head()

Unnamed: 0,Review,Label,label_num,cleaned
65372,Its amazing i love it,Other,0,Its amazing i love it
65390,it's fun but I can't download it in my phone,Bug,3,it es fun but I ca NT download it in my phone
65394,This is the best game,Other,0,This be the best game
65402,This is beautiful game this is so good and plz...,Other,0,This be beautiful game this be so good and Pl ...
65416,MORE weapons and maps pls. Oh and also make ma...,Balance,1,MORE weapon and map plus Oh and also make matc...


In [15]:
labeled.label_num.value_counts()#/len(labeled)

0    3452
3     355
5     221
1     165
2      63
4       9
Name: label_num, dtype: int64

In [16]:
labeled_tot = pd.read_excel('model_labeled.xlsx')

In [18]:
labeled_tot.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Rating,Review,lang,cleaned,model_label
0,5,5,3,"I really like this game, I am currently on lev...",en,"I really like this game , I be currently on le...",5
1,6,6,3,I like it I can just never get good weapons th...,en,I like it I can just never get good weapon the...,5
2,10,10,5,I really love this game it contains strategy a...,en,I really love this game it contain strategy an...,0
3,17,17,5,It good to play the game,en,It good to play the game,0
4,25,25,4,Nice game,en,Nice game,0


In [16]:
#labeled.to_excel('new_labeled_4000.xlsx')

In [21]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier,RidgeClassifierCV,LogisticRegressionCV, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [121]:
# Vocabulary of 1- to 3-grams learned from the cleaned English reviews; terms
# must occur in at least 2 documents and in no more than 95% of them.
# The resulting mapping is reused as a fixed vocabulary by later pipelines.
vect = CountVectorizer(
    analyzer = 'word',
    ngram_range = (1,3),
    stop_words = 'english',
    min_df = 2,
    max_df = 0.95,
)
vect.fit(comments_en.cleaned)
vocab = vect.vocabulary_

In [19]:
def eval_model(y_train,y_test,y_train_pred,y_test_pred):
    """Print per-class classification reports for the train and test splits.

    Parameters
    ----------
    y_train, y_test : array-like of int
        True label ids (0-5) for each split.
    y_train_pred, y_test_pred : array-like of int
        Predicted label ids for the matching split.
    """
    # Position in this list == numeric label id.
    class_names = ['Other',
        'Balance',
        'Graphics',
        'Bug',
        'Advertising',
        'Monetization']
    # Pass the ids explicitly so every name stays attached to its own class
    # even when a rare class is absent from one of the splits.
    class_ids = list(range(len(class_names)))

    for heading, y_true, y_hat in (('train scores\n', y_train, y_train_pred),
                                   ('test scores\n', y_test, y_test_pred)):
        print(heading)
        print(classification_report(y_true, y_hat, labels = class_ids, target_names = class_names))

In [20]:
from sklearn.base import TransformerMixin
class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that turns a sparse matrix into a dense array."""

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return `X` as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()`` because the latter
        produces a ``numpy.matrix``. Input without ``toarray`` (already dense)
        is passed through ``np.asarray``.
        """
        if hasattr(X, 'toarray'):
            return X.toarray()
        return np.asarray(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and transform `X` in one call."""
        return self.fit(X, y, **fit_params).transform(X)

In [93]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LassoCV
# NOTE: this rebinds the notebook-level `vect`, `model` and `scaler`.
vect = CountVectorizer()
model = RidgeClassifierCV()
scaler = MaxAbsScaler()  # not part of the pipeline below
lin_model = Pipeline([('vectorizer', vect),
                      ('classifier', model)])
# The vectorizer runs with the fixed vocabulary `vocab`. CountVectorizer
# ignores max_features / min_df / max_df whenever a vocabulary is supplied,
# so those settings are not passed here.
lin_model.set_params(vectorizer__ngram_range = (1,3),
                     vectorizer__analyzer = 'word',
                     vectorizer__stop_words = 'english',
                     vectorizer__vocabulary = vocab,
                     classifier__class_weight = 'balanced')
feats = labeled.cleaned
labels = labeled.label_num
lin_model = lin_model.fit(feats, labels)
# NOTE(review): the default score is accuracy, and class 0 alone makes up
# roughly 81% of `labeled` (3452 of 4265), so read this number against that.
print('cross_val_score: %1.4f'% (np.mean(cross_val_score(lin_model, feats, labels))))

cross_val_score: 0.8457


In [22]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LassoCV
from sklearn.ensemble import BaggingClassifier
# 1- to 3-gram counts learned from the labelled reviews themselves.
vect = CountVectorizer(
    analyzer = 'word',
    ngram_range = (1,3),
    stop_words = 'english',
    min_df = 1,
    max_df = 0.95,
)
vect.fit(labeled.cleaned)
model = LogisticRegressionCV(class_weight = 'balanced')
# Despite its name, `boosted` is a bagging ensemble: each member is a
# class-weighted logistic regression trained on 1000 features drawn with
# replacement.
boosted = BaggingClassifier(
    base_estimator = model,
    max_features = 1000,
    bootstrap_features = True,
    verbose = 1,
    n_jobs = -1,
)
feats = vect.transform(labeled.cleaned)
labels = labeled.label_num
boosted = boosted.fit(feats, labels)
mean_cv_score = np.mean(cross_val_score(boosted, feats,labels))
print('cross_val_score: %1.4f'% (mean_cv_score))

KeyboardInterrupt: 

In [75]:
def eval_boosted(input_text,model = boosted, vectorizer = vect):
    """Classify one raw review and return the predicted category name.

    The text is cleaned with clean_comment(), vectorized with `vectorizer`,
    densified, and passed to `model.predict`; the predicted numeric id is
    used as an index into the category-name list. The default `model` and
    `vectorizer` are the objects bound to `boosted` and `vect` at the moment
    this function is defined.
    """
    # Position in this list == numeric label id.
    class_names = ['Other',
        'Balance',
        'Graphics',
        'Bug',
        'Advertising',
        'Monetization']
    sparse_feats = vectorizer.transform([clean_comment(input_text)])
    predicted_id = model.predict(sparse_feats.toarray())[0]
    return class_names[predicted_id]

In [76]:
eval_boosted(' They nerfed the prototype so much,its disgusting. They need to bring the prototype back,its the main reason why I even fight and use the game. They also need to bring back the private servers in sandbox. THEY GOT RID OF THE SWIPE OPTION FOR THE CONTROLS,it\'s slower to use now thanks to that. I might just quit at this point thought it was 5 star when i began it was like that for 3 years,now,it\'s a 1 or 2 star. ')

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


'Other'

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC
# AdaBoost over class-weighted linear SVMs on scaled tf-idf n-gram features.
# NOTE(review): `vect` is not used by the pipeline below (the 'vectorizer'
# step is commented out), but this assignment still rebinds the notebook-level
# `vect` to an unfitted CountVectorizer.
vect = CountVectorizer()
model = LinearSVC(class_weight = 'balanced')
scaler = MaxAbsScaler()
lin_model = Pipeline([#('vectorizer', vect),
                      # FeatureUnion with a single branch: 1- to 3-gram counts
                      # (5000 most frequent terms) re-weighted by tf-idf.
                      ('features', FeatureUnion([
                        ('ngram_tf_idf', Pipeline([
                          ('counts', CountVectorizer(ngram_range = (1,3), max_features = 5000, stop_words = 'english')),
                          ('tf_idf', TfidfTransformer())]))
                      ])),
                        ('scaler', scaler),
                      ('classifier', AdaBoostClassifier(base_estimator = model, algorithm='SAMME'))])
# Training data here is `labeled_tot`, whose target column `model_label` was
# read from 'model_labeled.xlsx' (labels written out by a model, not by hand).
feats = labeled_tot.cleaned
labels = labeled_tot.model_label
#X_train, X_test, y_train, y_test = train_test_split(feats.tolist(), labels.tolist(), test_size=0.2)
lin_model = lin_model.fit(feats, labels)
#y_train_pred = lin_model.predict(X_train)
#y_test_pred = lin_model.predict(X_test)
#eval_model(y_train,y_test,y_train_pred,y_test_pred)
# NOTE(review): the score below is computed on `labeled` (hand labels), not on
# the `feats`/`labels` used for the fit above. cross_val_score refits the
# pipeline on folds of the data it is given, so this number presumably does
# not reflect the model fitted on `labeled_tot` -- confirm which was intended.
print('cross_val_score: %1.4f'% (np.mean(cross_val_score(lin_model, labeled.cleaned,labeled.label_num))))

cross_val_score: 0.8370


In [139]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.ensemble import AdaBoostClassifier
# NOTE(review): `vect` is not used by the pipeline below, but this assignment
# still rebinds the notebook-level `vect` to an unfitted CountVectorizer.
vect = CountVectorizer()
model = LogisticRegressionCV()
scaler = MaxAbsScaler()
# Counts over the fixed vocabulary `vocab` -> tf-idf -> max-abs scaling ->
# cross-validated logistic regression. max_features is not passed to the
# CountVectorizer because it is ignored whenever a vocabulary is supplied.
lin_model = Pipeline([
    ('features', FeatureUnion([
        ('ngram_tf_idf', Pipeline([
            ('counts', CountVectorizer(ngram_range = (1,3),
                                       vocabulary = vocab,
                                       stop_words = 'english')),
            ('tf_idf', TfidfTransformer())]))
    ])),
    ('scaler', scaler),
    ('classifier', model)])
lin_model.set_params(classifier__class_weight = 'balanced')
feats = labeled.cleaned
labels = labeled.label_num
lin_model = lin_model.fit(feats, labels)
# NOTE(review): the default score is accuracy, and class 0 alone makes up
# roughly 81% of `labeled` (3452 of 4265), so read this number against that.
print('cross_val_score: %1.4f'% (np.mean(cross_val_score(lin_model, feats, labels))))

cross_val_score: 0.8356


In [28]:
import pickle
# Persist whichever pipeline is currently bound to `lin_model`.
serialized_model = pickle.dumps(lin_model)
with open('logit.pkl', 'wb') as f:
    f.write(serialized_model)

In [27]:
from sklearn.metrics import f1_score,roc_auc_score,accuracy_score
def eval_classifier(input_text,model, vectorizer = None):
    """Classify one raw review with a bare estimator and return its label id.

    Parameters
    ----------
    input_text : str
        Raw review text; it is cleaned with clean_comment() first.
    model : estimator
        Fitted object exposing ``predict`` on a dense feature array.
    vectorizer : fitted vectorizer, optional
        Turns the cleaned text into features. Defaults to the notebook-level
        `vect`; pass one explicitly when `vect` has been rebound since the
        model was trained (several cells assign a fresh CountVectorizer to it).

    Returns
    -------
    The predicted numeric label (0 Other, 1 Balance, 2 Graphics, 3 Bug,
    4 Advertising, 5 Monetization).
    """
    if vectorizer is None:
        vectorizer = vect
    cleaned_text = clean_comment(input_text)
    feats = vectorizer.transform([cleaned_text])
    prediction = model.predict(feats.toarray())
    return prediction[0]
def eval_pipeline(input_text, model = lin_model):
    """Classify one raw review with a text pipeline and return its label id.

    The review is cleaned with clean_comment() and handed to `model.predict`
    as a one-element list. The default `model` is the object bound to
    `lin_model` at the moment this function is defined.

    Returns the numeric label: 0 Other, 1 Balance, 2 Graphics, 3 Bug,
    4 Advertising, 5 Monetization.
    """
    predictions = model.predict([clean_comment(input_text)])
    return predictions[0]
def val_score(model):
    """Score `model` on the hand-labelled validation workbook.

    Reads 'temp data/val google play 2.xlsx', predicts a label for every
    review with `model` (via eval_pipeline), writes the frame with an extra
    'predicted' column to 'predicted.xlsx', prints a classification report
    and the accuracy, and returns ``(y_true, y_pred)``.

    Parameters
    ----------
    model : fitted text pipeline
        Must accept a list of cleaned review strings in ``predict``.
    """
    # Position in this list == numeric label id.
    class_names = ['Other',
        'Balance',
        'Graphics',
        'Bug',
        'Advertising',
        'Monetization']
    val_en = pd.read_excel('temp data/val google play 2.xlsx')
    # The Label column is compared to the numeric predictions as-is.
    # NOTE(review): this assumes the workbook stores numeric ids -- confirm.
    y_true = val_en.Label
    # Forward the caller's model explicitly; without it eval_pipeline would
    # fall back to its own default and ignore this function's argument.
    y_pred = val_en.Review.apply(lambda review: eval_pipeline(review, model = model))
    val_en['predicted'] = y_pred
    val_en.to_excel('predicted.xlsx')
    print(classification_report(y_true, y_pred, target_names = class_names))
    print('model accuracy %1.4f'%(accuracy_score(y_true, y_pred)))
    return y_true,y_pred
y_true,y_pred = val_score(lin_model)

              precision    recall  f1-score   support

       Other       0.57      0.80      0.67         5
     Balance       1.00      0.60      0.75         5
    Graphics       1.00      1.00      1.00         5
         Bug       0.80      0.80      0.80         5
 Advertising       1.00      0.60      0.75         5
Monetization       0.57      0.80      0.67         5

 avg / total       0.82      0.77      0.77        30

model accuracy 0.7667


In [29]:
eval_pipeline('The reason I gave the game 1 star is because most of my guns are tons of coins to buy them like the Bad Doctor gun, it was 980 coins to buy it. When I checked on my little brothers phone and looked at the prices, the Bad Doctor gun was 140 coins for him. Please fix my prices and I will 100% rate this game 5 stars because I LOVE this game. PLEASE FIX THIS BECAUSE I CAN BARELY GET ANY GUN.')

3

In [30]:
eval_pipeline('It\'s good but it crashes way to much and it\'s so pay to win I could make a whole day ranting about how unfair it is and maybe your game might become better if you listen what other people want like no armor sever and more this game could be better if you listened to what other people want like less overpowered weapons and less overpriced things')

5

In [31]:
eval_pipeline('prototype was supposed to be powerful btw')

1

In [32]:
eval_pipeline('graphics are strong in this one')

2

In [33]:
eval_pipeline('the game crashes like every 5 minutes')

3

In [34]:
eval_pipeline('Way to many ads. Thats so annoying')

4

In [35]:
eval_pipeline('weapons prices are just crazy! pg3d r u nuts?')

5

In [36]:
eval_pipeline('The game is so unfair when i level up some higher than me pops up i die all the time you earn money and you cant fix this ')

5

In [37]:
eval_pipeline('BUUURN HERETIC!!')

3

In [115]:
# Label every English review with the current pipeline. assign() builds a new
# frame instead of writing a column into `comments_en`, which was created as a
# filtered slice of `comments` (in-place writes there raise
# SettingWithCopyWarning).
comments_en = comments_en.assign(model_label = comments_en.Review.apply(eval_pipeline))

In [116]:
comments_en['model_label'].value_counts()/len(comments_en)

0    0.719442
3    0.116924
5    0.102770
1    0.041957
2    0.017036
4    0.001870
Name: model_label, dtype: float64

In [117]:
comments_en.to_excel('model_labeled.xlsx')