In [1]:
import pandas as pd
import numpy as np
from langdetect import detect
import re

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
comments = pd.read_csv('../data/comments_lang.csv')
comments_en = comments[comments.lang == 'en']
vect = CountVectorizer(ngram_range = (1,1), analyzer = 'word',
                       stop_words = 'english',
                       max_features = 10000,
                       min_df = 2, max_df = 0.95).fit(comments_en.Review)

In [3]:
pw = list(vect.vocabulary_.keys())

In [4]:
import enchant
c = enchant.Dict("en_UK")
def check_spelling(text):
    if not c.check(text):
        suggestions = list(set(c.suggest(text)).intersection(set(pw)))
        if len(suggestions)>0:
            res = suggestions[0]
        else:
            res = text
    else:
        res = text
    return res

In [5]:
check_spelling('stu pid')

'stupid'

In [6]:
labeled1 = pd.read_excel('manual_labels.xlsx')
labeled2 = pd.read_excel('manual_labels2.xlsx')
labeled3 = pd.read_excel('manual_labels3.xlsx')
labeled4 = pd.read_excel('manual_labels4.xlsx')
labeled5 = pd.read_excel('manual_labels5.xlsx')

In [7]:
labeled5.label.value_counts()

4    621
2    172
5     89
0     82
3     25
1     11
Name: label, dtype: int64

In [8]:
labeled2.shape

(1000, 5)

In [9]:
labeled1.label.value_counts()

4    624
2    203
5     74
0     60
3     31
1      8
Name: label, dtype: int64

In [10]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
def clean_comment(text):
    wnl = WordNetLemmatizer()
    deacc = re.sub(r'\W',' ', text)
    tokens = word_tokenize(deacc)
    res = ''
    for t in tokens:
        res += wnl.lemmatize(t)+' '
    return res
def get_tokens(text):
    wnl = WordNetLemmatizer()
    deacc = re.sub(r'\W',' ', text)
    tokens = word_tokenize(deacc)
    return tokens

In [11]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import nltk
from nltk.corpus import wordnet
from emoji.unicode_codes import UNICODE_EMOJI
import emoji
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

def clean_comment(text):
    wnl = WordNetLemmatizer()
    deacc = re.sub(r'\!',' exclamation_point ', text)
    tokens = word_tokenize(deacc)
    tags = nltk.pos_tag(tokens)
    processed = []
    for (word, tag) in tags:
        wn_tag = get_wordnet_pos(tag)
        if wn_tag!='':
            processed.append(wnl.lemmatize(word,wn_tag))
        else:
            processed.append(wnl.lemmatize(check_spelling(word)))
    res = ' '.join(processed)
    return res

In [12]:
labeled = pd.concat([labeled1, labeled2, labeled3, labeled4, labeled5],axis = 0)

In [13]:
labeled.loc[:,'cleaned'] = labeled.Review.apply(clean_comment)

In [14]:
labeled['tokens'] = labeled.Review.apply(get_tokens)

In [15]:
labeled_long = labeled[labeled.tokens.apply(len)>6]
labeled_neg = labeled[labeled.label!=4]

In [16]:
#comments_en['cleaned'] = comments_en.Review.apply(clean_comment)

In [17]:
#comments_en.to_csv('comments_en_cleaned.csv')

In [18]:
comments_en = pd.read_csv('comments_en_cleaned.csv')

In [19]:
from keras.preprocessing.text import Tokenizer
from keras.layers import TimeDistributed
from keras.models import load_model
import re
import keras.backend as K
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras import regularizers
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import Flatten
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.utils.class_weight import compute_class_weight
from keras.layers.recurrent import LSTM

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [21]:
t = Tokenizer()

#rds = corpus[rd.tokens.apply(len)>5]
t = Tokenizer()
t.fit_on_texts(comments_en.cleaned.tolist())
vocab_size = len(t.word_index) + 1

In [22]:
encoded_docs = t.texts_to_sequences(labeled.cleaned)
max_length = labeled.tokens.apply(len).max()
feats = encoded_docs
labels = to_categorical(labeled.label)
X_train, X_test, y_train, y_test = train_test_split(feats, labels, test_size=0.2)
X_train = sequence.pad_sequences(X_train, maxlen=max_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_length)

In [129]:
model = Sequential()

model.add(Embedding(vocab_size, 150, 
                    input_length=max_length,
                   embeddings_regularizer = regularizers.l2(1e-4)))
model.add(Dropout(0.2))
model.add(Conv1D(filters=100, kernel_size=25, padding='same', activation='sigmoid'))
model.add(Conv1D(filters=25, kernel_size=25, padding='same', activation='sigmoid'))
model.add(MaxPooling1D(pool_size=5))
#model.add(Flatten())
model.add(Dropout(0.2))
model.add(LSTM(10, activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(6, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

In [130]:
class_weight = compute_class_weight('balanced'
                                               ,[0,1,2,3,4,5]
                                               ,labeled.label.apply(int).tolist())

In [131]:
checkpointer = ModelCheckpoint(filepath='weights.hdf5', verbose=1, save_best_only=True)
model.fit(X_train, y_train, epochs=50, batch_size=500,
          validation_data = [X_test,y_test],
          callbacks=[checkpointer],class_weight = class_weight)

Train on 4000 samples, validate on 1000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50


Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1c5557f8d0>

In [132]:
def eval_model(y_train,y_test,y_train_pred,y_test_pred):
    
    class_names = ['unknown',
        'Crash',
        'Balance problems',
        'Synchronization',
        'Positive',
        'Bug']
    
    class_names_b = ['neg', 'pos']
    print('train scores\n')
    print(classification_report(y_train, y_train_pred, target_names = class_names))
    print('test scores\n')
    print(classification_report(y_test, y_test_pred, target_names = class_names))

In [141]:
from sklearn.metrics import f1_score,roc_auc_score,accuracy_score, classification_report
def eval_network(input_text, model = model):
    cleaned_text = clean_comment(input_text)
    class_names = ['Other',
        'Crash',
        'Balance problems',
        'Synchronization',
        'Positive',
        'Bug']
    seq = t.texts_to_sequences([cleaned_text])
    padded_sequence = sequence.pad_sequences(seq, maxlen=max_length)
    prediction = model.predict(padded_sequence)
    #print(class_names[prediction[0]])
    return np.argmax(class_weight*prediction[0])

def val_score(model):
    class_names = ['Other',
        'Crash',
        'Balance problems',
        'Synchronization',
        'Positive',
        'Bug']    
    val_en = pd.read_excel('validation_en.xlsx')
    y_true = []
    y_pred = []
    for i in range(0,6):
        y_true.append([i]*10)
        y_pred.append(val_en.iloc[:,i].apply(eval_network))
    y_true = np.array(y_true).flatten()
    y_pred = np.array(y_pred).flatten()
    print(classification_report(y_true, y_pred, target_names = class_names))
    print('model accuracy %1.4f'%(accuracy_score(y_true, y_pred)))
    return y_true,y_pred
y_true,y_pred = val_score(model)

                  precision    recall  f1-score   support

           Other       0.80      0.80      0.80        10
           Crash       0.33      0.20      0.25        10
Balance problems       0.00      0.00      0.00        10
 Synchronization       0.29      0.80      0.42        10
        Positive       0.60      0.60      0.60        10
             Bug       0.00      0.00      0.00        10

     avg / total       0.34      0.40      0.35        60

model accuracy 0.4000


  'precision', 'predicted', average, warn_for)


In [142]:
eval_network('this dum gaem crashes every time i launch it')

5

In [143]:
eval_network('those new weapons are so dam op')

1

In [144]:
eval_network('The game glitched and all of my trophies and guns are now lost')

1

In [145]:
eval_network('Cool!')

4

In [146]:
eval_network('This game is haard to control')

0

In [147]:
eval_network('Mucho gusto')

0