In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
comments = pd.read_csv('../comments_en_cleaned.csv')
comments_en = comments[comments.lang == 'en']
vect = CountVectorizer(ngram_range = (1,1), analyzer = 'word',
                       stop_words = 'english',
                       max_features = 500,
                       min_df = 2, max_df = 0.95).fit(comments_en.Review)
pw = list(vect.vocabulary_.keys())

In [3]:
labeled4 = pd.read_excel('../temp data/for_labeling 4.xlsx').loc[:,['Review', 'Label', 'Sublabel']]
labeled1 = pd.read_excel('../temp data/for_labeling 1.xlsx').loc[:,['Review', 'Label', 'Sublabel']]
labeled2 = pd.read_excel('../temp data/for_labeling 2.xlsx').loc[:,['Review', 'Label', 'Sublabel']]

In [4]:
labeled2.Sublabel.value_counts()

3.0    49
1.0    24
2.0    11
Name: Sublabel, dtype: int64

In [5]:
labeled1.Sublabel = labeled1.Sublabel.apply(lambda x: str(x).lower())
labeled4.Sublabel = labeled4.Sublabel.apply(lambda x: str(x).lower())

In [6]:
labeled4.Sublabel.value_counts()

nan                 4955
combat balance        24
matchmaking           17
gameplay balance      11
Name: Sublabel, dtype: int64

In [8]:
classes_nums = {
    'Balance':1,
    'Graphics':2,
    'Bug':3,
    'Advertising':4,
    'Monetization':5,
    'Other':0
}
subclasses_nums = {
    'combat balance': 1,
    'gameplay balance':2,
    'matchmaking':3
}
labeled1['sublabel_num'] = labeled1.Sublabel.map(subclasses_nums)
labeled4['sublabel_num'] = labeled4.Sublabel.map(subclasses_nums)
labeled2['sublabel_num'] = labeled2.Sublabel

In [9]:
labeled2.sublabel_num.value_counts()

3.0    49
1.0    24
2.0    11
Name: sublabel_num, dtype: int64

In [10]:

labeled1['label_num'] = labeled1.Label.map(classes_nums)
labeled4['label_num'] = labeled4.Label.map(classes_nums)

labeled1['sublabel_num'] = labeled1.Sublabel.map(subclasses_nums)
labeled4['sublabel_num'] = labeled4.Sublabel.map(subclasses_nums)

labeled2['label_num'] = labeled2.Label


In [11]:
labeled4[labeled4.label_num==1].shape

(53, 5)

In [12]:
labeled4.Sublabel.value_counts()

nan                 4955
combat balance        24
matchmaking           17
gameplay balance      11
Name: Sublabel, dtype: int64

In [13]:
labeled4.sublabel_num.value_counts()

1.0    24
3.0    17
2.0    11
Name: sublabel_num, dtype: int64

In [14]:
labeled = pd.concat([labeled4, labeled2, labeled1], axis = 0)
labeled = labeled.dropna(axis = 0)

In [15]:
labeled.sublabel_num.value_counts()

1.0    73
3.0    69
2.0    22
Name: sublabel_num, dtype: int64

In [16]:
unlabeled = pd.read_excel('model_labeled.xlsx')

In [18]:
unlabeled_balance = unlabeled[unlabeled.model_label==1]

In [19]:
unlabeled_balance.shape

(830, 7)

In [20]:
import enchant
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize,TreebankWordTokenizer
import re
import nltk
from nltk.corpus import wordnet
from emoji.unicode_codes import UNICODE_EMOJI
import emoji

c = enchant.Dict("en_UK")
def check_spelling(text):
    if not c.check(text):
        suggestions = list(set(c.suggest(text)).intersection(set(pw)))
        if len(suggestions)>0:
            res = suggestions[0]
        elif len(c.suggest(text))>0:
            res = c.suggest(text)[0]
        else:
            res = text
    else:
        res = text
    return res

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

def clean_comment(text):
    wnl = WordNetLemmatizer()
    tokenizer = TreebankWordTokenizer()
    #deacc = re.sub(r'\!',' exclamation_point ', text)
    tokens = tokenizer.tokenize(text)
    tags = nltk.pos_tag(tokens)
    processed = []
    for (word, tag) in tags:
        wn_tag = get_wordnet_pos(tag)
        if wn_tag!='':
            processed.append(wnl.lemmatize(check_spelling(word),wn_tag))
        else:
            processed.append(wnl.lemmatize(check_spelling(word)))
    res = ' '.join(processed)
    return res.lower()

In [116]:
labeled.loc[:,'cleaned'] = labeled.Review.apply(clean_comment)

In [117]:
labeled_combat = labeled[labeled.sublabel_num==1]
labeled_gameplay = labeled[labeled.sublabel_num==2]
labeled_matchmaking = labeled[labeled.sublabel_num==3]

In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def get_top_words(df, filename):
    vect = CountVectorizer(ngram_range = (1,1), analyzer = 'word',
                           stop_words = 'english',
                           #max_features = 200,
                           min_df = 2, max_df = 0.95)
    vectors = vect.fit_transform(df.cleaned)
    z = zip(vect.get_feature_names(),
        np.asarray(vectors.sum(axis=0)*1000/vectors.shape[0]).ravel())
    freqs = pd.Series()
    for fn,cnt in z:
        freqs[fn] = cnt
    freqs.sort_values(ascending = False).to_excel(filename+'.xlsx')
    return freqs.sort_values(ascending = False)

In [22]:
#combat_freqs = get_top_words(labeled_combat, 'combat_topwords')
#gameplay_freqs = get_top_words(labeled_gameplay, 'gameplay_topwords')
#matchmaking_freqs = get_top_words(labeled_matchmaking, 'matchmaking_topwords')

In [23]:
from nltk.tokenize import TreebankWordTokenizer
def check4word(w, freqs):
    if w in freqs.index:
        return freqs.loc[w].tolist()[0]
    else:
        return 0
def get_cat_by_kw(text):
    tokenizer = TreebankWordTokenizer()
    cleaned = clean_comment(text)
    tokens         = tokenizer.tokenize(cleaned)
    cat_freqs      = pd.DataFrame(columns = tokens)
    combat_freqs    = pd.read_excel('combat_topwords.xlsx')
    gameplay_freqs  = pd.read_excel('gameplay_topwords.xlsx')
    matchmaking_freqs = pd.read_excel('matchmaking_topwords.xlsx')
    classes_nums = {
        'Combat Balance':1,
        'Gameplay Balance':2,
        'Matchmaking':3,
        'Other':0
    }    
    for w in tokens:
        #cat_freqs.loc['Other',w] = check4word(w,other_freqs)
        cat_freqs.loc['Combat Balance',w] = check4word(w,combat_freqs)
        cat_freqs.loc['Gameplay Balance',w] = check4word(w,gameplay_freqs)
        cat_freqs.loc['Matchmaking',w] = check4word(w,matchmaking_freqs)
    if cat_freqs.apply(sum).sum()==0:
        return 0
    else:
        return classes_nums[cat_freqs.apply(sum,axis =1).idxmax()]
    return cat_freqs#.apply(sum,axis =1).idxmax()

In [24]:
get_cat_by_kw('Matchmaking is unfair')

3

In [27]:
unlabeled_balance['sublabel'] = unlabeled_balance.loc[:,'Review'].apply(get_cat_by_kw)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [28]:
unlabeled_balance.sublabel.value_counts()

1    382
2    216
0    133
3     99
Name: sublabel, dtype: int64

In [33]:
labeled_combat = unlabeled_balance[unlabeled_balance.sublabel==1]
labeled_gameplay = unlabeled_balance[unlabeled_balance.sublabel==2]
labeled_matchmaking = unlabeled_balance[unlabeled_balance.sublabel==3]

In [39]:
combat_freqs = get_top_words(labeled_combat, 'it1/combat_topwords')
gameplay_freqs = get_top_words(labeled_gameplay, 'it1/gameplay_topwords')
matchmaking_freqs = get_top_words(labeled_matchmaking, 'it1/matchmaking_topwords')

In [41]:
from nltk.tokenize import TreebankWordTokenizer
def check4word(w, freqs):
    if w in freqs.index:
        return freqs.loc[w].tolist()[0]
    else:
        return 0
def get_cat_by_kw(text):
    tokenizer = TreebankWordTokenizer()
    cleaned = clean_comment(text)
    tokens         = tokenizer.tokenize(cleaned)
    cat_freqs      = pd.DataFrame(columns = tokens)
    combat_freqs    = pd.read_excel('it1/combat_topwords.xlsx')
    gameplay_freqs  = pd.read_excel('it1/gameplay_topwords.xlsx')
    matchmaking_freqs = pd.read_excel('it1/matchmaking_topwords.xlsx')
    classes_nums = {
        'Combat Balance':1,
        'Gameplay Balance':2,
        'Matchmaking':3,
        'Other':0
    }    
    for w in tokens:
        #cat_freqs.loc['Other',w] = check4word(w,other_freqs)
        cat_freqs.loc['Combat Balance',w] = check4word(w,combat_freqs)
        cat_freqs.loc['Gameplay Balance',w] = check4word(w,gameplay_freqs)
        cat_freqs.loc['Matchmaking',w] = check4word(w,matchmaking_freqs)
    if cat_freqs.apply(sum).sum()==0:
        return 0
    else:
        return classes_nums[cat_freqs.apply(sum,axis =1).idxmax()]
    return cat_freqs#.apply(sum,axis =1).idxmax()

In [56]:
get_cat_by_kw(' Can you pls nerfed the primary weapons like mega gun, and anything because some people killing me with mega gun... And we are not rich, we are poor! Other players are stealing their mom\'s credit but we didn\'t do that... All the guns are high price?!! What?!! This is not even a rich man game, this is PIXEL GUN!!! And i want just to say.... I haven\'t a armory button... Can you pls give me I\'m not even happy when I started to play pg3d... Pls give me... ')

1

In [55]:
 get_cat_by_kw('Great game! But its just the lucky chest! Please make it so we dont get coins and gems everytime! Put more guns and gadgets or make it so we get 50 gems instead of 3 5 and 10 ')

1

In [54]:
 get_cat_by_kw(' This is amazing, I can\'t stop playing. So many different weapons, gadgets, and game modes that is basically Minecraft in a shooting game but better! ')

1

In [53]:
get_cat_by_kw(' Piss poor matchmaking is what is scaring me away from this game. That is literally the only reason why I would not recommend anyone to play this. The grind to stand a chance against the veterans you will be matched with frequently is not worth it. It gets 2 stars for overall gameplay. ')

3

In [57]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score,roc_auc_score,accuracy_score

def val_score():
    class_names = ['Other',
        'Combat',
        'Gameplay',
        'Matchmaking']   
    val_en = pd.read_excel('../temp data/val google play 2.xlsx')
    classes_nums = {
        'Combat':1,
        'Gameplay':2,
        'Matchmaking':3,
        'Other':0
    }    
    #val_en['label_num'] = val_en.Label.map(classes_nums)
    y_true = val_en.Label
    y_pred = val_en.Review.apply(get_cat_by_kw)
    val_en['predicted'] = y_pred
    val_en.to_excel('predicted.xlsx')
    print(classification_report(y_true, y_pred, target_names = class_names))
    print('model accuracy %1.4f'%(accuracy_score(y_true, y_pred)))
    return y_true,y_pred
y_true,y_pred = val_score()

             precision    recall  f1-score   support

      Other       0.27      0.80      0.40         5
     Combat       0.10      0.20      0.13         5
   Gameplay       0.00      0.00      0.00         5
Matchmaking       0.20      0.20      0.20         5

avg / total       0.09      0.20      0.12        30

model accuracy 0.2000


  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
