Exploring The Data


In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Mount Drive

In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Reading Dataset


In [40]:
dataset =  pd.read_csv('/content/drive/My Drive/Assignment 2/train.tsv', sep='\t')
dataset = dataset.dropna()
dataset.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [41]:
dataset.shape

(156060, 4)

In [42]:
dataset.Sentiment.value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

Adjustable Parameters

In [0]:
remove_fPunct = True
fTokenizaton = True
fStopwords = True
fStemming = False
fLemmatization = True


Data Cleaning | Punctuations

In [44]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [0]:
def remove_punctuation(text):
  txt_nonpunct = "".join([a for a in text if a not in string.punctuation])
  return txt_nonpunct 

In [0]:
if remove_fPunct:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: remove_punctuation(x)) 

Data Cleaning | Tokenization

In [0]:
import re

def tokenize(text):
  tokens = re.split('\W+', text)
  return tokens 

if fTokenizaton:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: tokenize(x.lower()))

Data Cleaning | Stop Words

In [48]:
# import nltk
# nltk.download('stopwords')

# h = nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]
# h

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [0]:
def remove_stopwords(txt_tokenized):
  txt_clean = [word for word in txt_tokenized if word not in stopwords]
  return txt_clean

if fStopwords:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: remove_stopwords(x))

Data Cleaning | Stemming

In [0]:
from nltk.stem import PorterStemmer
# from nltk.stem import snowballstemmer
ps = PorterStemmer()


In [0]:
def stemming(tokenized_text):
  text = [ps.stem(word) for word in tokenized_text]
  return text

In [0]:
if fStemming:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: stemming(x))

Data Cleaning | Lemmatization

In [0]:
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

def lemmatization(token_txt):
  text = [wn.lemmatize(word) for word in token_txt]
  return text


In [0]:
if fLemmatization:
  dataset['Phrase'] = dataset['Phrase'].apply(lambda x: lemmatization(x))

In [55]:
dataset.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,"[series, escapade, demonstrating, adage, good,...",1
1,2,1,"[series, escapade, demonstrating, adage, good,...",2
2,3,1,[series],2
3,4,1,[],2
4,5,1,[series],2


In [56]:
import random
random.seed(9001)
newData = dataset.to_numpy()
print(newData)
np.random.shuffle(newData)
# print("New data", newData)

xyz = pd.DataFrame(newData,columns=['PhraseId','SentenceId','Phrase','Sentiment'])
# xyz.head()

dataset = xyz
dataset.head()

[[1 1
  list(['series', 'escapade', 'demonstrating', 'adage', 'good', 'goose', 'also', 'good', 'gander', 'occasionally', 'amuses', 'none', 'amount', 'much', 'story', ''])
  1]
 [2 1
  list(['series', 'escapade', 'demonstrating', 'adage', 'good', 'goose'])
  2]
 [3 1 list(['series']) 2]
 ...
 [156058 8544 list(['avuncular', 'chortle']) 3]
 [156059 8544 list(['avuncular']) 2]
 [156060 8544 list(['chortle']) 2]]


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,97453,5096,[android],2
1,58468,2946,"[favor, mushy, obviousness]",2
2,58182,2932,"[merely, kicking, undead, ]",2
3,120984,6479,"[played, ryan, gosling]",2
4,61856,3127,"[movie, weighs, glass, flat, champagne, ]",1


Splitting The Dataset

In [57]:

X_train, X_test, Y_train, Y_test = train_test_split(dataset['Phrase'], dataset['Sentiment'], test_size=0.3, random_state=2003)
documents = []
X_train = np.array(X_train.values.tolist())
Y_train = np.array(Y_train.values.tolist())
for i in range(len(X_train)):
  documents.append([list(X_train[i]), Y_train[i]]) 

X_test = np.array(X_test.values.tolist())
Y_test = np.array(Y_test.values.tolist())
for i in range(len(X_test)):
  documents.append([list(X_test[i]), Y_test[i]]) 

print(documents[0][0])
# for i in documents:
#   print(i)
# for i range(len(documents)):
#   label = documents[i][1]
#   documents[i] = 

dataset = pd.DataFrame(documents, columns=['text', 'sentiment']) 
dataset['join'] = dataset.text.apply(' '.join)
# print(documents)
dataset.head()

['never', 'truly', 'come', 'care', 'main', 'character', 'whether', 'wind', 'together']


Unnamed: 0,text,sentiment,join
0,"[never, truly, come, care, main, character, wh...",1,never truly come care main character whether w...
1,"[nt, funny]",1,nt funny
2,"[seller, smart, woman]",2,seller smart woman
3,"[could, make, decent, budget]",3,could make decent budget
4,"[star, nia, vardalos]",2,star nia vardalos


In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(dataset['join'],  dataset['sentiment'], test_size=0.3, random_state=2003)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from keras.utils import to_categorical

vectorizer = TfidfVectorizer(max_features = 2500)
X = vectorizer.fit_transform(dataset["join"]) 
Y = dataset['sentiment'] 
 
X_train = vectorizer.transform(X_train).toarray()
Y_train = Y_train 
X_test = vectorizer.transform(X_test).toarray()
Y_test = Y_test

In [60]:
Y_test

13510     2
61932     2
82549     3
137718    3
121990    3
         ..
94224     2
135456    2
154729    1
23031     1
57870     1
Name: sentiment, Length: 46818, dtype: int64

In [0]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras import backend as K

In [0]:
num_classes = 5

In [0]:
from keras import backend as K

def recall_method(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_method(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_score(y_true, y_pred):
    precision = precision_method(y_true, y_pred)
    recall = recall_method(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [64]:
X_train.shape

(109242, 2500)

In [65]:
Y_train = keras.utils.to_categorical(Y_train, num_classes)
Y_test = keras.utils.to_categorical(Y_test, num_classes)
Y_test

array([[0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]], dtype=float32)

In [66]:
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3,
                 activation='relu',
                 input_shape=(2500,1)))

model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Dropout(rate = 0.25))
model.add(Flatten())
model.add(Dense(64, activation='relu'))

model.add(Dense(num_classes, activation='softmax'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_4 (Conv1D)            (None, 2498, 64)          256       
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 2496, 64)          12352     
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 2494, 64)          12352     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2494, 64)          0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 2494, 64)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 159616)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)               

In [0]:
# model.compile(loss=keras.losses.categorical_crossentropy,optimizer=keras.optimizers.Adadelta(),metrics=['accuracy'])

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy',f1_score,precision_method,recall_method])

In [0]:
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [0]:
model.fit(X_train, Y_train,
          batch_size=32,
          epochs=10)

score = model.evaluate(X_test, Y_test, verbose=0)


print('Test loss:', score[0])
print('Test accuracy:', score[1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
  6432/109242 [>.............................] - ETA: 1:50 - loss: 0.7059 - acc: 0.7220 - f1_score: 0.7131 - precision_method: 0.7466 - recall_method: 0.6835

In [0]:
from keras.models import load_model

model.save('/content/drive/My Drive/Assignment 2/1117269_1dconv_reg.h5')