In [1]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Load the review corpus and keep only the rows flagged as English.
comments = pd.read_csv('../comments_en_cleaned.csv')
comments_en = comments[comments['lang'] == 'en']

# Fit a unigram bag-of-words model on the raw review text; its vocabulary
# (at most 500 terms, English stop words removed) is the list of "popular
# words" that check_spelling() accepts as corrections.
vect = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=500,
    min_df=2,
    max_df=0.95,
)
vect.fit(comments_en['Review'])
pw = list(vect.vocabulary_)

In [4]:
import enchant
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import nltk
from nltk.corpus import wordnet
from emoji.unicode_codes import UNICODE_EMOJI
import emoji

# NOTE(review): "en_UK" is not a standard locale tag ("en_GB" is the usual
# tag for British English) -- confirm this dictionary exists on the target system.
c = enchant.Dict("en_UK")
def check_spelling(text):
    """Return `text`, or a spelling correction taken from the corpus vocabulary.

    A word the dictionary `c` accepts is returned unchanged. Otherwise the
    dictionary's suggestions are scanned in the order enchant returns them
    and the first one that also appears in the popular-word list `pw` is
    returned. If no suggestion is in `pw`, `text` is returned unchanged.

    Parameters
    ----------
    text : str
        A single token.

    Returns
    -------
    str
        The original token or its replacement.
    """
    # Nothing to check for an empty token, so return it unchanged.
    if not text or c.check(text):
        return text
    vocabulary = set(pw)
    # Walk the suggestions in order instead of taking an arbitrary element of
    # a set intersection: set ordering is not stable between runs, which made
    # the chosen correction non-deterministic.
    for candidate in c.suggest(text):
        if candidate in vocabulary:
            return candidate
    return text

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J, V, N or R map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively. Any other tag (including an
    empty one) yields the empty string.
    """
    attr_by_prefix = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = attr_by_prefix.get(treebank_tag[:1])
    if attr_name is None:
        return ''
    return getattr(wordnet, attr_name)

def clean_comment(text):
    """Normalise a review into a space-joined string of lemmatised tokens.

    Every '!' is replaced by the token ' exclamation_point ', the text is
    tokenised and POS-tagged, and each token is lemmatised. Tokens whose tag
    maps to a WordNet POS are lemmatised with that POS; all other tokens are
    passed through check_spelling() first and lemmatised with the default POS.
    """
    lemmatizer = WordNetLemmatizer()
    spelled_out = re.sub(r'\!', ' exclamation_point ', text)
    tagged_tokens = nltk.pos_tag(word_tokenize(spelled_out))

    def _normalise(token, treebank_tag):
        # Only tokens without a mapped WordNet POS are spell-checked.
        pos = get_wordnet_pos(treebank_tag)
        if pos == '':
            return lemmatizer.lemmatize(check_spelling(token))
        return lemmatizer.lemmatize(token, pos)

    return ' '.join(_normalise(token, tag) for token, tag in tagged_tokens)

In [5]:
# Load the keyword-labelled reviews; later cells read its `kw_label` and
# `cleaned` columns.
labeled = pd.read_excel('kw_classified.xlsx')

In [6]:
# One subset of `labeled` per kw_label code:
# 0 other, 1 balance, 2 graphics, 3 bug, 4 ads, 5 money.
(
    labeled_other,
    labeled_balance,
    labeled_graphics,
    labeled_bug,
    labeled_ads,
    labeled_money,
) = (labeled[labeled.kw_label == code] for code in range(6))

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def get_top_words(df, filename):
    """Compute the most frequent unigrams in `df.cleaned` and save them to Excel.

    A CountVectorizer limited to 50 features (English stop words removed,
    min_df=2, max_df=0.95) is fitted on the `cleaned` column. Each term's
    total count is scaled to occurrences per 1000 documents.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `cleaned` text column.
    filename : str
        Output path without extension; '.xlsx' is appended.

    Returns
    -------
    pandas.Series
        Term frequencies indexed by term, sorted in descending order. The
        same Series is written to `filename + '.xlsx'`.
    """
    vect = CountVectorizer(ngram_range = (1,1), analyzer = 'word',
                           stop_words = 'english',
                           max_features = 50,
                           min_df = 2, max_df = 0.95)
    vectors = vect.fit_transform(df.cleaned)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older releases.
    if hasattr(vect, 'get_feature_names_out'):
        terms = vect.get_feature_names_out()
    else:
        terms = vect.get_feature_names()
    # Column sums are total counts per term; scale to "per 1000 documents".
    per_thousand = np.asarray(vectors.sum(axis=0)*1000/vectors.shape[0]).ravel()
    # Build the Series in one go rather than growing an empty, dtype-less
    # Series item by item, and sort only once.
    freqs = pd.Series(per_thousand, index=list(terms)).sort_values(ascending = False)
    freqs.to_excel(filename+'.xlsx')
    return freqs

In [10]:
# Build and export the top-word table for every label subset, in the same
# order as the subsets were defined.
(
    other_freqs,
    balance_freqs,
    graphics_freqs,
    bug_freqs,
    ads_freqs,
    money_freqs,
) = (
    get_top_words(subset, stem + '_topwords')
    for subset, stem in [
        (labeled_other, 'other'),
        (labeled_balance, 'balance'),
        (labeled_graphics, 'graphics'),
        (labeled_bug, 'bug'),
        (labeled_ads, 'ads'),
        (labeled_money, 'money'),
    ]
)

In [14]:
from nltk.tokenize import TreebankWordTokenizer
def check4word(w, freqs):
    """Look up the frequency of word `w` in the table `freqs`.

    `freqs` is indexed by word. Returns the first value of the row labelled
    `w`, or 0 when `w` is not in the index.
    """
    if w not in freqs.index:
        return 0
    row_values = freqs.loc[w].tolist()
    return row_values[0]
# (file stem, numeric label) per category, listed in tie-break order: when
# several categories reach the same score the earliest one wins.
# Labels: 0 Other, 1 Balance, 2 Graphics, 3 Bug, 4 Advertising, 5 Monetization.
_KW_CATEGORIES = (
    ('other', 0),
    ('balance', 1),
    ('graphics', 2),
    ('bug', 3),
    ('ads', 4),
    ('money', 5),
)

# Top-word tables already loaded, keyed by directory. Re-running this cell
# resets the cache, which is how regenerated files get picked up.
_KW_FREQS_CACHE = {}

def _load_kw_freqs(freqs_dir):
    """Return [(label, top-word table), ...] for `freqs_dir`, reading each file once."""
    if freqs_dir not in _KW_FREQS_CACHE:
        _KW_FREQS_CACHE[freqs_dir] = [
            # index_col=0 makes the word column the index, which is what
            # check4word() looks words up in.
            (label, pd.read_excel('{}/{}_topwords.xlsx'.format(freqs_dir, stem),
                                  index_col=0))
            for stem, label in _KW_CATEGORIES
        ]
    return _KW_FREQS_CACHE[freqs_dir]

def get_cat_by_kw(text, freqs_dir='it1'):
    """Classify a review by summing per-category keyword frequencies.

    The text is cleaned with clean_comment() and tokenised. For every category
    the frequencies of all tokens (repeats included) are summed, and the
    category with the highest total wins. Ties go to the earliest category in
    the order Other, Balance, Graphics, Bug, Advertising, Monetization.

    Parameters
    ----------
    text : str
        Review text.
    freqs_dir : str, optional
        Directory holding the '<category>_topwords.xlsx' files.

    Returns
    -------
    int
        Numeric category label (0-5). Empty, whitespace-only or non-string
        input (e.g. NaN from a missing cell) returns 0 ('Other').
    """
    if not isinstance(text, str) or not text.strip():
        return 0
    tokens = TreebankWordTokenizer().tokenize(clean_comment(text))
    best_label, best_score = 0, None
    for label, freqs in _load_kw_freqs(freqs_dir):
        score = sum(check4word(w, freqs) for w in tokens)
        # Strict '>' keeps the earliest category on ties.
        if best_score is None or score > best_score:
            best_label, best_score = label, score
    return best_label

In [15]:
# Smoke test on a single raw review; the recorded output is label 1 ('Balance').
get_cat_by_kw('The game is so unfair when i level up some higher than me pops up i die all the time you earn money and you cant fix this ')

1

In [16]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score,roc_auc_score,accuracy_score

def val_score():
    """Score the keyword classifier on the hand-labelled validation sheet.

    Reads 'temp data/val google play 2.xlsx', predicts a category for every
    `Review` with get_cat_by_kw(), writes the sheet with an added `predicted`
    column to 'predicted.xlsx', and prints a classification report and the
    accuracy.

    Returns
    -------
    tuple
        (y_true, y_pred): the `Label` column and the predicted labels.
    """
    # Position i is the display name of numeric label i.
    class_names = ['Other',
        'Balance',
        'Graphics',
        'Bug',
        'Advertising',
        'Monetization']
    val_en = pd.read_excel('temp data/val google play 2.xlsx')
    # NOTE(review): assumes `Label` already holds numeric codes 0-5 -- confirm.
    y_true = val_en.Label
    y_pred = val_en.Review.apply(get_cat_by_kw)
    val_en['predicted'] = y_pred
    val_en.to_excel('predicted.xlsx')
    # Pass `labels` explicitly so target_names stays aligned with the label
    # codes even when a class is absent from both y_true and y_pred.
    print(classification_report(y_true, y_pred,
                                labels = list(range(len(class_names))),
                                target_names = class_names))
    print('model accuracy %1.4f'%(accuracy_score(y_true, y_pred)))
    return y_true,y_pred
y_true,y_pred = val_score()

              precision    recall  f1-score   support

       Other       0.45      1.00      0.62         5
     Balance       0.33      0.40      0.36         5
    Graphics       1.00      0.80      0.89         5
         Bug       1.00      0.60      0.75         5
 Advertising       1.00      0.60      0.75         5
Monetization       0.67      0.40      0.50         5

 avg / total       0.74      0.63      0.65        30

model accuracy 0.6333


In [17]:
# Reload the review corpus and keep only the English rows.
# NOTE(review): an earlier cell reads '../comments_en_cleaned.csv'; confirm
# which of the two paths is the intended one.
comments = pd.read_csv('comments_en_cleaned.csv')
# Take an explicit copy: a later cell adds a `kw_label` column to this frame,
# and assigning into a filtered selection raises SettingWithCopyWarning.
comments_en = comments[comments.lang == 'en'].copy()

In [18]:
# Predict a keyword category for every English review.
# NOTE(review): this passes the `cleaned` column although get_cat_by_kw()
# runs clean_comment() itself, while val_score() passes the raw `Review`
# column -- confirm which input is intended.
kw_labels = comments_en.cleaned.apply(get_cat_by_kw)

In [11]:
# Attach the predicted labels as a new `kw_label` column.
comments_en['kw_label'] = kw_labels

In [12]:
# Save the English reviews together with their `kw_label` column.
comments_en.to_excel('kw_labels_it0.xlsx')