In [1]:
import pandas as pd
import numpy as np
from langdetect import detect
import re

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
comments = pd.read_csv('../data/comments_lang.csv')
comments_en = comments[comments.lang == 'en']
vect = CountVectorizer(ngram_range = (1,1), analyzer = 'word',
                       stop_words = 'english',
                       max_features = 10000,
                       min_df = 2, max_df = 0.95).fit(comments_en.Review)

In [3]:
pw = list(vect.vocabulary_.keys())

In [4]:
import enchant
c = enchant.Dict("en_UK")
def check_spelling(text):
    if not c.check(text):
        suggestions = list(set(c.suggest(text)).intersection(set(pw)))
        if len(suggestions)>0:
            res = suggestions[0]
        else:
            res = text
    else:
        res = text
    return res

In [12]:
check_spelling('gud')

'god'

In [6]:
labeled1 = pd.read_excel('manual_labels.xlsx')
labeled2 = pd.read_excel('manual_labels2.xlsx')
labeled3 = pd.read_excel('manual_labels3.xlsx')
labeled4 = pd.read_excel('manual_labels4.xlsx')
labeled5 = pd.read_excel('manual_labels5.xlsx')

In [7]:
labeled5.label.value_counts()

4    621
2    172
5     89
0     82
3     25
1     11
Name: label, dtype: int64

In [8]:
labeled2.shape

(1000, 5)

In [9]:
labeled1.label.value_counts()

4    624
2    203
5     74
0     60
3     31
1      8
Name: label, dtype: int64

In [10]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
def clean_comment(text):
    wnl = WordNetLemmatizer()
    deacc = re.sub(r'\W',' ', text)
    tokens = word_tokenize(deacc)
    res = ''
    for t in tokens:
        res += wnl.lemmatize(t)+' '
    return res
def get_tokens(text):
    wnl = WordNetLemmatizer()
    deacc = re.sub(r'\W',' ', text)
    tokens = word_tokenize(deacc)
    return tokens

In [13]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import nltk
from nltk.corpus import wordnet
from emoji.unicode_codes import UNICODE_EMOJI
import emoji
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

def clean_comment(text):
    wnl = WordNetLemmatizer()
    deacc = re.sub(r'\!',' exclamation_point ', text)
    tokens = word_tokenize(deacc)
    tags = nltk.pos_tag(tokens)
    processed = []
    for (word, tag) in tags:
        wn_tag = get_wordnet_pos(tag)
        if wn_tag!='':
            processed.append(wnl.lemmatize(word,wn_tag))
        else:
            processed.append(wnl.lemmatize(check_spelling(word)))
    res = ' '.join(processed)
    return res

In [14]:
labeled = pd.concat([labeled1, labeled2, labeled3, labeled4, labeled5],axis = 0)

In [15]:
labeled.loc[:,'cleaned'] = labeled.Review.apply(clean_comment)

In [16]:
labeled['tokens'] = labeled.Review.apply(get_tokens)

In [17]:
labeled_long = labeled[labeled.tokens.apply(len)>6]
labeled_neg = labeled[labeled.label!=4]

In [18]:
#comments_en['cleaned'] = comments_en.Review.apply(clean_comment)

In [19]:
#comments_en.to_csv('comments_en_cleaned.csv')

In [20]:
comments_en = pd.read_csv('comments_en_cleaned.csv')

In [21]:
from keras.preprocessing.text import Tokenizer
from keras.layers import TimeDistributed
from keras.models import load_model
import re
import keras.backend as K
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras import regularizers
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import Flatten
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.utils.class_weight import compute_class_weight
from keras.layers.recurrent import LSTM

Using TensorFlow backend.


In [23]:
t = Tokenizer()

#rds = corpus[rd.tokens.apply(len)>5]
t = Tokenizer()
t.fit_on_texts(comments_en.cleaned.tolist())
vocab_size = len(t.word_index) + 1

In [25]:
encoded_docs = t.texts_to_sequences(labeled.cleaned)
max_length = labeled.tokens.apply(len).max()
feats = encoded_docs
labels = to_categorical(labeled.label)
X_train, X_test, y_train, y_test = train_test_split(feats, labels, test_size=0.2)
X_train = sequence.pad_sequences(X_train, maxlen=max_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_length)

In [26]:
model = Sequential()

model.add(Embedding(vocab_size, 150, 
                    input_length=max_length,
                   embeddings_regularizer = regularizers.l2(1e-3)))
model.add(Dropout(0.2))
model.add(Conv1D(filters=100, kernel_size=25, padding='same', activation='sigmoid'))
model.add(Conv1D(filters=25, kernel_size=25, padding='same', activation='sigmoid'))
model.add(MaxPooling1D(pool_size=5))
#model.add(Flatten())
model.add(Dropout(0.2))
model.add(LSTM(10, activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(6, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

In [27]:
class_weight = compute_class_weight('balanced'
                                               ,[0,1,2,3,4,5]
                                               ,labeled.label.apply(int).tolist())

In [None]:
checkpointer = ModelCheckpoint(filepath='weights.hdf5', verbose=1, save_best_only=True)
model.fit(X_train, y_train, epochs=200, batch_size=500,
          validation_data = [X_test,y_test],
          callbacks=[checkpointer],class_weight = class_weight)

Train on 4000 samples, validate on 1000 samples
Epoch 1/200
Epoch 00001: val_loss improved from inf to 2.42423, saving model to weights.hdf5
Epoch 2/200
Epoch 00002: val_loss improved from 2.42423 to 1.87397, saving model to weights.hdf5
Epoch 3/200
Epoch 00003: val_loss improved from 1.87397 to 1.58254, saving model to weights.hdf5
Epoch 4/200
Epoch 00004: val_loss improved from 1.58254 to 1.41804, saving model to weights.hdf5
Epoch 5/200
Epoch 00005: val_loss improved from 1.41804 to 1.32462, saving model to weights.hdf5
Epoch 6/200
Epoch 00006: val_loss improved from 1.32462 to 1.28438, saving model to weights.hdf5
Epoch 7/200
Epoch 00007: val_loss improved from 1.28438 to 1.26350, saving model to weights.hdf5
Epoch 8/200
Epoch 00008: val_loss improved from 1.26350 to 1.25426, saving model to weights.hdf5
Epoch 9/200
Epoch 00009: val_loss improved from 1.25426 to 1.24974, saving model to weights.hdf5
Epoch 10/200
Epoch 00010: val_loss did not improve
Epoch 11/200
Epoch 00011: val_lo

Epoch 00024: val_loss improved from 1.21925 to 1.21678, saving model to weights.hdf5
Epoch 25/200
Epoch 00025: val_loss improved from 1.21678 to 1.21289, saving model to weights.hdf5
Epoch 26/200
Epoch 00026: val_loss improved from 1.21289 to 1.20979, saving model to weights.hdf5
Epoch 27/200
Epoch 00027: val_loss improved from 1.20979 to 1.20294, saving model to weights.hdf5
Epoch 28/200
Epoch 00028: val_loss improved from 1.20294 to 1.19495, saving model to weights.hdf5
Epoch 29/200
Epoch 00029: val_loss improved from 1.19495 to 1.17783, saving model to weights.hdf5
Epoch 30/200
Epoch 00030: val_loss improved from 1.17783 to 1.15557, saving model to weights.hdf5
Epoch 31/200
Epoch 00031: val_loss improved from 1.15557 to 1.13511, saving model to weights.hdf5
Epoch 32/200
Epoch 00032: val_loss improved from 1.13511 to 1.12212, saving model to weights.hdf5
Epoch 33/200
Epoch 00033: val_loss improved from 1.12212 to 1.05783, saving model to weights.hdf5
Epoch 34/200
Epoch 00034: val_los

Epoch 48/200
Epoch 00048: val_loss did not improve
Epoch 49/200
Epoch 00049: val_loss did not improve
Epoch 50/200
Epoch 00050: val_loss did not improve
Epoch 51/200
Epoch 00051: val_loss did not improve
Epoch 52/200
Epoch 00052: val_loss did not improve
Epoch 53/200
Epoch 00053: val_loss did not improve
Epoch 54/200
Epoch 00054: val_loss did not improve
Epoch 55/200
Epoch 00055: val_loss did not improve
Epoch 56/200
Epoch 00056: val_loss did not improve
Epoch 57/200
Epoch 00057: val_loss did not improve
Epoch 58/200
Epoch 00058: val_loss did not improve
Epoch 59/200
Epoch 00059: val_loss did not improve
Epoch 60/200
Epoch 00060: val_loss did not improve
Epoch 61/200
Epoch 00061: val_loss did not improve
Epoch 62/200
Epoch 00062: val_loss did not improve
Epoch 63/200
Epoch 00063: val_loss did not improve
Epoch 64/200
Epoch 00064: val_loss did not improve
Epoch 65/200
Epoch 00065: val_loss did not improve
Epoch 66/200
Epoch 00066: val_loss did not improve
Epoch 67/200
Epoch 00067: val_l

Epoch 75/200
Epoch 00075: val_loss did not improve
Epoch 76/200
Epoch 00076: val_loss did not improve
Epoch 77/200
Epoch 00077: val_loss did not improve
Epoch 78/200
Epoch 00078: val_loss did not improve
Epoch 79/200
Epoch 00079: val_loss did not improve
Epoch 80/200
Epoch 00080: val_loss did not improve
Epoch 81/200
Epoch 00081: val_loss did not improve
Epoch 82/200
Epoch 00082: val_loss did not improve
Epoch 83/200
Epoch 00083: val_loss did not improve
Epoch 84/200
Epoch 00084: val_loss did not improve
Epoch 85/200
Epoch 00085: val_loss did not improve
Epoch 86/200
Epoch 00086: val_loss did not improve
Epoch 87/200
Epoch 00087: val_loss did not improve
Epoch 88/200
Epoch 00088: val_loss did not improve
Epoch 89/200
Epoch 00089: val_loss did not improve
Epoch 90/200
Epoch 00090: val_loss did not improve
Epoch 91/200
Epoch 00091: val_loss did not improve
Epoch 92/200
Epoch 00092: val_loss did not improve
Epoch 93/200
Epoch 00093: val_loss did not improve
Epoch 94/200
Epoch 00094: val_l

Epoch 102/200
Epoch 00102: val_loss did not improve
Epoch 103/200
Epoch 00103: val_loss did not improve
Epoch 104/200
Epoch 00104: val_loss did not improve
Epoch 105/200
Epoch 00105: val_loss did not improve
Epoch 106/200
Epoch 00106: val_loss did not improve
Epoch 107/200
Epoch 00107: val_loss did not improve
Epoch 108/200
Epoch 00108: val_loss did not improve
Epoch 109/200
Epoch 00109: val_loss did not improve
Epoch 110/200
Epoch 00110: val_loss did not improve
Epoch 111/200
Epoch 00111: val_loss did not improve
Epoch 112/200
Epoch 00112: val_loss did not improve
Epoch 113/200
Epoch 00113: val_loss did not improve
Epoch 114/200
Epoch 00114: val_loss did not improve
Epoch 115/200
Epoch 00115: val_loss did not improve
Epoch 116/200
Epoch 00116: val_loss did not improve
Epoch 117/200
Epoch 00117: val_loss did not improve
Epoch 118/200
Epoch 00118: val_loss did not improve
Epoch 119/200
Epoch 00119: val_loss did not improve
Epoch 120/200
Epoch 00120: val_loss did not improve
Epoch 121/20

In [132]:
def eval_model(y_train,y_test,y_train_pred,y_test_pred):
    
    class_names = ['unknown',
        'Crash',
        'Balance problems',
        'Synchronization',
        'Positive',
        'Bug']
    
    class_names_b = ['neg', 'pos']
    print('train scores\n')
    print(classification_report(y_train, y_train_pred, target_names = class_names))
    print('test scores\n')
    print(classification_report(y_test, y_test_pred, target_names = class_names))

In [141]:
from sklearn.metrics import f1_score,roc_auc_score,accuracy_score, classification_report
def eval_network(input_text, model = model):
    cleaned_text = clean_comment(input_text)
    class_names = ['Other',
        'Crash',
        'Balance problems',
        'Synchronization',
        'Positive',
        'Bug']
    seq = t.texts_to_sequences([cleaned_text])
    padded_sequence = sequence.pad_sequences(seq, maxlen=max_length)
    prediction = model.predict(padded_sequence)
    #print(class_names[prediction[0]])
    return np.argmax(class_weight*prediction[0])

def val_score(model):
    class_names = ['Other',
        'Crash',
        'Balance problems',
        'Synchronization',
        'Positive',
        'Bug']    
    val_en = pd.read_excel('validation_en.xlsx')
    y_true = []
    y_pred = []
    for i in range(0,6):
        y_true.append([i]*10)
        y_pred.append(val_en.iloc[:,i].apply(eval_network))
    y_true = np.array(y_true).flatten()
    y_pred = np.array(y_pred).flatten()
    print(classification_report(y_true, y_pred, target_names = class_names))
    print('model accuracy %1.4f'%(accuracy_score(y_true, y_pred)))
    return y_true,y_pred
y_true,y_pred = val_score(model)

                  precision    recall  f1-score   support

           Other       0.80      0.80      0.80        10
           Crash       0.33      0.20      0.25        10
Balance problems       0.00      0.00      0.00        10
 Synchronization       0.29      0.80      0.42        10
        Positive       0.60      0.60      0.60        10
             Bug       0.00      0.00      0.00        10

     avg / total       0.34      0.40      0.35        60

model accuracy 0.4000


  'precision', 'predicted', average, warn_for)


In [142]:
eval_network('this dum gaem crashes every time i launch it')

5

In [143]:
eval_network('those new weapons are so dam op')

1

In [144]:
eval_network('The game glitched and all of my trophies and guns are now lost')

1

In [145]:
eval_network('Cool!')

4

In [146]:
eval_network('This game is haard to control')

0

In [147]:
eval_network('Mucho gusto')

0