Start from a raw corpus, clean it (stop-word removal, lemmatization, etc.), and perform sentiment analysis on it.

Main libraries to use for preprocessing: nltk, spacy (maybe make a comparison between the two)

Model: small MLP (1 hidden layer) built with Keras (imported as `ks`)

---
Format: sentence \t score \n

---

Score is either 1 (for positive) or 0 (for negative)	

In [74]:
import tensorflow as tf
import keras as ks
import pandas as pd
import numpy as np
import nltk, spacy, os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [75]:
# Create a sorted list of the available dataset files, excluding the readme.txt

corpus_dir = "sentiment_txts"
pathlist = [
    f'./{corpus_dir}/{txtname}'
    # os.listdir returns entries in arbitrary order; sorting keeps the corpus
    # order (and hence the seeded train/test split) reproducible.
    for txtname in sorted(os.listdir(corpus_dir))
    # The readme does not follow the "sentence \t score" format, so skip it.
    if txtname.lower() != 'readme.txt'
]
pathlist

['./sentiment_txts/amazon_cells_labelled.txt',
 './sentiment_txts/imdb_labelled.txt',
 './sentiment_txts/yelp_labelled.txt']

In [76]:
# Read the files and separate them into text and label
# the readme states that the format is sentence \t score \n

sentencelist = []
labels = []

for path in pathlist:
    # The context manager closes the file handle once the file has been read.
    # NOTE(review): assumes the corpus files are UTF-8 encoded — confirm; the
    # explicit encoding keeps decoding independent of the platform default.
    with open(path, encoding='utf-8') as corpus_file:
        for line in corpus_file:
            if not line.strip():
                continue  # tolerate blank lines instead of failing to unpack
            # Split on the LAST tab only, so a sentence that itself contains
            # a tab is kept intact and the score is always the final field.
            sentence, score = line.rsplit('\t', 1)
            sentencelist.append(sentence)
            # int() ignores surrounding whitespace, including the newline
            labels.append(int(score))

labels = np.array(labels)
len(sentencelist)

3000

In [77]:
# Hold out 20% of the sentences as a test set (fixed seed for repeatability)

(x_train_o, x_test_o, y_train, y_test) = train_test_split(
    sentencelist,
    labels,
    test_size=0.2,
    random_state=42,
)

In [78]:
# Set up the TfidfVectorizer used below: lowercase the text, drop English
# stop words and L2-normalise each resulting row

vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    norm='l2',
)

In [87]:
# Compute the Tf-Idf features of the sentences with sklearn's TfidfVectorizer

# The vocabulary is learned from the training sentences only
train_matrix = vectorizer.fit_transform(x_train_o)
x_train = train_matrix.toarray().astype('float32')

# The test set must go through transform (not fit_transform): it is encoded
# with the vocabulary learned during training
test_matrix = vectorizer.transform(x_test_o)
x_test = test_matrix.toarray().astype('float32')

x_train.shape, x_test.shape

((2400, 4255), (600, 4255))

In [88]:
# Peek at the feature matrix: one row per vocabulary term, one column per
# training sentence

feature_names = vectorizer.get_feature_names_out()
df = pd.DataFrame(x_train.T, index=feature_names)
df.iloc[130:140]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2390,2391,2392,2393,2394,2395,2396,2397,2398,2399
agree,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ahead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aimless,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
air,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aired,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
akasha,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
akin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ala,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
alarm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
alert,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
# Baseline: test accuracy of a scikit-learn Perceptron on the same features

from sklearn.linear_model import Perceptron

classifier = Perceptron(random_state=42)
classifier.fit(x_train, y_train)

# Accuracy = fraction of test sentences whose predicted label matches
baseline_predictions = classifier.predict(x_test)
np.mean(baseline_predictions == y_test)

0.7566666666666667

In [90]:
from keras.layers import Dense, Dropout, Activation
from keras import Sequential

In [91]:
# Build an MLP with a single hidden layer using the Keras Sequential API

input_shape = (x_train.shape[1],)  # one input per column of the Tf-Idf matrix
num_classes = 1  # a single output unit: 1 means positive, 0 means negative

model = Sequential([
    Dense(64, input_shape=input_shape, activation='relu'),
    Dense(num_classes, activation='sigmoid'),
])

In [92]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# during training, with 20% of the training data used for validation

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    x_train,
    y_train,
    epochs=15,
    batch_size=16,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x27b6be16310>

In [93]:
# The sigmoid output is continuous, so the predictions have to be rounded
# to obtain the actual label (0 or 1); show the first five raw outputs
model.predict(x_test)[:5, 0]

array([5.5393577e-04, 9.1115046e-01, 1.2307790e-01, 5.6758428e-01,
       4.0718311e-01], dtype=float32)

In [94]:
# Test accuracy: round each sigmoid output to 0/1 and compare with the labels
test_probabilities = model.predict(x_test)[:, 0]
np.mean(np.round(test_probabilities) == y_test)

0.7983333333333333