In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pls

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
comments = pd.read_csv('comments_en_cleaned.csv')
comments_en = comments[comments.lang == 'en']
vect = CountVectorizer(ngram_range = (1,1), analyzer = 'word',
                       stop_words = 'english',
                       max_features = 500,
                       min_df = 2, max_df = 0.95).fit(comments_en.Review)
pw = list(vect.vocabulary_.keys())

In [4]:
with open("pw.txt", "w") as text_file:
    for w in pw:
        print(w, file=text_file)

In [5]:
import enchant
c = enchant.Dict("en_UK")
def check_spelling(text):
    if not c.check(text):
        suggestions = list(set(c.suggest(text)).intersection(set(pw)))
        if len(suggestions)>0:
            res = suggestions[0]
        elif len(c.suggest(text))>0:
            res = c.suggest(text)[0]
        else:
            res = text
    else:
        res = text
    return res

In [6]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize,TreebankWordTokenizer
import re
import nltk
from nltk.corpus import wordnet
from emoji.unicode_codes import UNICODE_EMOJI
import emoji
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

def clean_comment(text):
    wnl = WordNetLemmatizer()
    tokenizer = TreebankWordTokenizer()
    #deacc = re.sub(r'\!',' exclamation_point ', text)
    tokens = tokenizer.tokenize(text)
    tags = nltk.pos_tag(tokens)
    processed = []
    for (word, tag) in tags:
        wn_tag = get_wordnet_pos(tag)
        if wn_tag!='':
            processed.append(wnl.lemmatize(check_spelling(word),wn_tag))
        else:
            processed.append(wnl.lemmatize(check_spelling(word)))
    res = ' '.join(processed)
    return res

In [7]:
classes_nums = {
    'Balance':1,
    'Graphics':2,
    'Bug':3,
    'Advertising':4,
    'Monetization':5,
    'Other':0
}
labeled4 = pd.read_excel('temp data/for_labeling 4.xlsx').loc[:,['Review', 'Label']]
labeled1 = pd.read_excel('temp data/for_labeling 1.xlsx').loc[:,['Review', 'Label']]
labeled2 = pd.read_excel('temp data/for_labeling 2.xlsx').loc[:,['Review', 'Label']]
labeled2 = labeled2[(labeled2.Label!='?')&(labeled2.Label!='-')]
labeled1['label_num'] = labeled1.Label.map(classes_nums)
labeled4['label_num'] = labeled4.Label.map(classes_nums)
labeled2['label_num'] = labeled2.Label
labeled = pd.concat([labeled4, labeled2, labeled1], axis = 0)

In [12]:
labeled['label_num'] = labeled.label_num.fillna(-1)
labeled.label_num = labeled.label_num.apply(int)

In [13]:
labeled.loc[:,'cleaned'] = labeled.Review.apply(clean_comment)

In [20]:
from sklearn.base import TransformerMixin, BaseEstimator
class DenseTransformer(BaseEstimator, TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

In [37]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LassoCV
from sklearn.semi_supervised import label_propagation
from scipy.sparse import csgraph
from sklearn.model_selection import cross_val_score
vect = CountVectorizer()
#model = RidgeClassifierCV()
lp_model = label_propagation.LabelSpreading(kernel = 'rbf',gamma = 0.25, n_neighbors = 10,alpha = 0.5, max_iter=15, n_jobs = -1)
scaler = MaxAbsScaler()
lin_model = Pipeline([('vectorizer', vect),
                      ('dense_transformer', DenseTransformer()),
                      ('classifier', lp_model)])
lin_model.set_params(vectorizer__ngram_range = (1,3),vectorizer__analyzer = 'word', vectorizer__stop_words = 'english',
                     vectorizer__max_features = 800,
                     vectorizer__min_df = 2, vectorizer__max_df = 0.95)
feats = labeled.cleaned
labels = labeled.label_num
lin_model = lin_model.fit(feats, labels)
print('cross_val_score: %1.4f'% (np.mean(cross_val_score(lin_model, labeled.cleaned,labeled.label_num))))

cross_val_score: 0.2324


In [33]:
pd.Series(lin_model.named_steps['classifier'].transduction_).value_counts()

0    13630
3      564
5      375
1      166
2      104
4       14
dtype: int64

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vect = CountVectorizer(ngram_range = (1,3),
                       analyzer = 'word',
                       stop_words = 'english',
                       max_features = 1000,
                       min_df = 2, max_df = 0.9).fit(total.Review)
feats = vect.transform(total.Review).toarray()
labels = total.label.as_matrix()

In [21]:
feats.shape

(13800, 1000)

In [36]:
from sklearn.semi_supervised import label_propagation
from scipy.sparse import csgraph
lp_model = label_propagation.LabelSpreading(kernel = 'rbf',gamma = 0.1, n_neighbors = 10,alpha = 0.2, max_iter=15)
lp_model.fit(feats, labels)

ValueError: could not convert string to float: 'i like the gun and the armor and skin gadget and everything # AW SOME'

In [49]:
lp_model.n_iter_

8

In [50]:
pd.Series(lp_model.transduction_).value_counts()

4    9804
0    2567
2     912
5     388
3      88
1      41
dtype: int64

In [35]:
from sklearn.metrics import f1_score,roc_auc_score,accuracy_score, classification_report

def eval_classifier(input_text,model):
    cleaned_text = clean_comment(input_text)
    feats = vect.transform([cleaned_text])
    class_names = ['Other',
        'Balance',
        'Graphics',
        'Bug',
        'Advertising',
        'Monetization']
    prediction = model.predict(feats.toarray())
    #print(class_names[prediction[0]])
    return prediction[0]
def eval_pipeline(input_text, model = lin_model):
    cleaned_text = clean_comment(input_text)
    class_names = ['Other',
        'Balance',
        'Graphics',
        'Bug',
        'Advertising',
        'Monetization']
    prediction = model.predict([cleaned_text])
    #print(class_names[prediction[0]])
    return prediction[0]
def val_score(model):
    class_names = ['Other',
        'Balance',
        'Graphics',
        'Bug',
        'Advertising',
        'Monetization']   
    val_en = pd.read_excel('temp data/val google play 2.xlsx')
    classes_nums = {
        'Balance':1,
        'Graphics':2,
        'Bug':3,
        'Advertising':4,
        'Monetization':5,
        'Other':0
    }    
    #val_en['label_num'] = val_en.Label.map(classes_nums)
    y_true = val_en.Label
    y_pred = val_en.Review.apply(eval_pipeline)
    val_en['predicted'] = y_pred
    val_en.to_excel('predicted.xlsx')
    print(classification_report(y_true, y_pred, target_names = class_names))
    print('model accuracy %1.4f'%(accuracy_score(y_true, y_pred)))
    return y_true,y_pred
y_true,y_pred = val_score(lin_model)

              precision    recall  f1-score   support

       Other       0.17      1.00      0.29         5
     Balance       0.00      0.00      0.00         5
    Graphics       0.00      0.00      0.00         5
         Bug       0.00      0.00      0.00         5
 Advertising       0.00      0.00      0.00         5
Monetization       1.00      0.20      0.33         5

 avg / total       0.20      0.20      0.10        30

model accuracy 0.2000


  'precision', 'predicted', average, warn_for)
