In [5]:
import numpy as np
import pandas as pd
train = pd.read_csv("reddit_train.csv",index_col=0,usecols=[0,4,6])
test = pd.read_csv("reddit_test.csv",index_col=0,usecols=[0,4,6])

In [7]:
import re
import nltk

import string

try:
    nltk.download('stopwords')
except:
    pass

from nltk.corpus import stopwords

import snowballstemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yidi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
from nltk.stem import WordNetLemmatizer

In [19]:
def find_pos(word):
    # Part of Speech constants
    # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    pos= nltk.pos_tag(nltk.word_tokenize(word))[0][1]
    # Adjective tags -'JJ', 'JJR', 'JJS'
    if pos.lower()[0] == 'j':
        return 'a'
    # Adverb tags -'RB', 'RBR', 'RBS'
    elif pos.lower()[0] == 'r':
        return 'r'
    # Verb tags -'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'
    elif pos.lower()[0] == 'v':
        return 'v'
    # Noun tags -'NN', 'NNS', 'NNP', 'NNPS'
    else:
        return 'n'

In [6]:
def words_lemmatizer(text, encoding="utf8"):
    words = nltk.word_tokenize(text)
    lemma_words = [] 
    wl= WordNetLemmatizer()
    for word in words:
        pos= find_pos(word)
        lemma_words.append(wl.lemmatize(word, pos))
    return " ".join(lemma_words)

In [20]:

def clean_text(text):
    text = text.replace("&#xD;", " ").replace("&#xA;", " ").replace("&amp;",'& ')
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]# and len(w) >= 3]
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)

    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"[^A-Za-z]", " ", text)
    
    text = words_lemmatizer(text)
    
    return text


def split_sent(x):
    x= nltk.sent_tokenize(x)
    return x

def get_clean_tokens(sentence):
    
    cleaned = clean_text(sentence)
    
    tokens = nltk.word_tokenize(cleaned)
    
    return tokens

# apply the above function to df['text']
import copy
from copy import deepcopy
def tokenize_sentences(df):
    #Avoid directly modify original dataframe
    df_temp = copy.deepcopy(df)
   
    df_temp['sentences'] = df_temp['Text'].apply(split_sent)
    
    df_temp['tokenized_sentences'] = list(map(lambda sentences: list(map(get_clean_tokens, sentences)), 
                                              df_temp.sentences))
    #Remove all list == []
    df_temp['tokenized_sentences'] = list(map(lambda sentences: list(filter(lambda lst: lst, sentences)),
                                         df_temp.tokenized_sentences)) 
    
    return df_temp[['Outcome','tokenized_sentences']]
    
train_data = tokenize_sentences(train)



#Train word2vec model

from gensim.models import Word2Vec
train_sentences = [sentence for sentences in train_data.tokenized_sentences for sentence in sentences]

W2Vmodel = Word2Vec(sentences=train_sentences, vector_size=50,sg=1, hs=0, workers=4, min_count=3, window=8,
                    sample=1e-3, negative=5)

print('W2Vbuilt')



from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train['Text'], train['Outcome'], test_size=0.2, stratify = train['Outcome'],random_state=3)

df = pd.DataFrame()
df['Text']= deepcopy(X_train)
df['Outcome'] = deepcopy(y_train)

df.head()


df['Outcome'].value_counts()

val= pd.DataFrame()
val['Text']= deepcopy(X_val)
val['Outcome'] = deepcopy(y_val)


df_resample_1 = deepcopy(df[df['Outcome']==1]).sample(380, replace=True)
df_resample_2 = deepcopy(df[df['Outcome']==2]).sample(500, replace=True)

df_resample = pd.concat([df_resample_1, df_resample_2,df],  ignore_index=True).sample(frac=1).reset_index(drop=True)



df_resample['Text'] = df_resample['Text'].apply(clean_text)

val['Text'] = val['Text'].apply(clean_text)
test['Text'] = test['Text'].apply(clean_text)




from sklearn.datasets import make_classification
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

len_95 = 200


#Set a number which is larger than vocab to keep all useful information
NUM_WORDS = 10000
tokenizer = Tokenizer(num_words=NUM_WORDS,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)
tokenizer.fit_on_texts(df_resample['Text'])
X_train_sequences = tokenizer.texts_to_sequences(df_resample['Text'])
X_eva_sequences = tokenizer.texts_to_sequences(val['Text'])
X_test_sequences = tokenizer.texts_to_sequences(test['Text'])
word_index = tokenizer.word_index

#Make input to be same length
X_train = pad_sequences(X_train_sequences, maxlen=int(len_95))
X_val = pad_sequences(X_eva_sequences, maxlen=int(len_95))
X_test = pad_sequences(X_test_sequences, maxlen=int(len_95))

y_train_dummy = pd.get_dummies(df_resample['Outcome']).values
y_test_dummy = pd.get_dummies(val['Outcome']).values

print('Shape of label tensor:', y_test_dummy.shape)

import gensim
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

#Loading previous trained word2vec model
word_vectors = W2Vmodel.wv
EMBEDDING_DIM=50 #The same as word2vec features

vocabulary_size=min(len(word_index)+1,NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

#Give embedding matrix value according to word2vec model
for word, i in word_index.items():
    if i < vocabulary_size:     
        try:
            embedding_matrix[i] = word_vectors[word]
        except KeyError:
            #Ignore words not exist in train
            embedding_matrix[i] = np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)
    else: continue
del(word_vectors)

W2Vbuilt
Shape of label tensor: (254, 3)


In [21]:
from tensorflow.keras.models import Sequential
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import MinMaxScaler  
from tensorflow.keras.layers import Dense 
from tensorflow.keras.layers import LSTM 
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv2D, Conv1D
from tensorflow.keras.layers import Input
from tensorflow.keras import Model
from tensorflow.keras.layers import GlobalMaxPooling1D,GlobalMaxPooling2D
from tensorflow.keras.layers import MaxPooling2D, Flatten, Reshape,MaxPooling1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from math import sqrt 
from tensorflow import keras

In [22]:
for j in [0,1,2,3,4]:
    for l in [0.01]: #[0.1,0.05,0.01,0.005]:
        for f in [32]:#[32,64]:
            for ls in [40]:#[20, 40,80]:
                print(l,f,ls)
                model = Sequential()
                model.add(Embedding(vocabulary_size, EMBEDDING_DIM, input_length=200,weights=[embedding_matrix],
                                            trainable=True))
                model.add(Conv1D(filters=f, kernel_size=3, padding='same', activation='relu'))
                model.add(MaxPooling1D(pool_size=2))
                model.add(LSTM(ls))
                
                model.add(Dense(3, activation='softmax'))
                                    
                                              
                optimizer = keras.optimizers.Adam(lr=l)
                model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
                name1 = 'weights_best_5153projet.hdfs'
                filepath= name1
                checkpoint = ModelCheckpoint(filepath, monitor='val_loss', save_best_only=True, verbose=1,  mode='min',save_weights_only=True)
                callbacks_list = [checkpoint]
                model.fit(X_train, y_train_dummy, epochs=5,batch_size = 10, verbose = 0,workers=4, use_multiprocessing=True,callbacks = callbacks_list,validation_data=(X_val,y_test_dummy))
                
                from sklearn.metrics import confusion_matrix
                pred1 = list()
                y_pred1 = model.predict(X_val)
                from sklearn.metrics import classification_report
                
                for i in range(len(y_pred1)):
                    pred1.append(np.argmax(y_pred1[i]))
                
                print(confusion_matrix(val['Outcome'], pred1))
                
                pred2 = list()
                y_pred2 = model.predict(X_test)
                
                    
                for i in range(len(y_pred2)):
                    pred2.append(np.argmax(y_pred2[i]))
                    
                print(classification_report(test['Outcome'], pred2, labels=[0,1,2]))
                
                
                print(confusion_matrix(test['Outcome'], pred2))
                
                
                from tensorflow.keras.backend import clear_session
                clear_session()


0.01 32 40

Epoch 00001: val_loss improved from inf to 1.02208, saving model to weights_best_5153projet.hdfs

Epoch 00002: val_loss did not improve from 1.02208

Epoch 00003: val_loss did not improve from 1.02208

Epoch 00004: val_loss did not improve from 1.02208

Epoch 00005: val_loss did not improve from 1.02208
[[136  19   4]
 [ 44  16   4]
 [ 19   4   8]]
              precision    recall  f1-score   support

           0       0.65      0.91      0.76       199
           1       0.48      0.19      0.27        80
           2       0.33      0.08      0.12        39

    accuracy                           0.63       318
   macro avg       0.49      0.39      0.38       318
weighted avg       0.57      0.63      0.56       318

[[181  13   5]
 [ 64  15   1]
 [ 33   3   3]]
0.01 32 40

Epoch 00001: val_loss improved from inf to 1.06845, saving model to weights_best_5153projet.hdfs

Epoch 00002: val_loss did not improve from 1.06845

Epoch 00003: val_loss did not improve from 1.068