<a href="https://colab.research.google.com/github/wojtekgradzinski/WojtekRepo/blob/main/Amazon_Reviews_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import torch
from sklearn.model_selection import train_test_split
import pandas as pd
import spacy 

#!pip install fasttext
from torchtext.vocab import FastText
from collections import Counter
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook

In [42]:
#Getting data

df = pd.read_csv('data_twitter.csv', encoding='ISO-8859-1', names=["target", "ids", "date", "flag", "user", "text"])

df

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [43]:
#checking unique values 0-negative  , 4 - positive
df.target.unique()

array([0, 4], dtype=int64)

In [44]:
#Creating random samples from big data
df = df.sample(50000).reset_index()



In [45]:
df.target.value_counts()

4    25198
0    24802
Name: target, dtype: int64

In [46]:
#Creating labels 

possible_labels = df.target.unique()
label_dict = {}
for index, label in enumerate(possible_labels):
    label_dict[label] = index
# Let's make label 0 == negative, 1 == positive

df['label'] = df.target.replace(label_dict)
df = df.drop(['date', 'flag', 'user', 'ids', 'target', 'index'], axis=1)
df.head(10)

Unnamed: 0,text,label
0,"kind of sad, confused and lost. which leads me...",0
1,why does today have to be monday? (and who is...,0
2,"@serahhh oh i'm so sorry, bb. you know where ...",0
3,My head hurts sooo bad right now! Ugh...I hate...,0
4,I miss david but he's kinda mad at me,0
5,@snowed_in I loved that movie! And I miss Raul...,0
6,@zamon no thanl god ! @bexxi pulled it and I w...,1
7,Arrested in the hotel,1
8,@a_smart_union Can't answer you DM if you're n...,0
9,plus its raining &amp;&amp; i gotta cycle frm ...,0


In [47]:
df.label.unique()

array([0, 1], dtype=int64)

In [48]:
fasttext = FastText("simple")
nlp = spacy.load("en_core_web_sm")

def preprocessing(sentence):
    """
    params sentence: a str containing the sentence we want to preprocess
    return the tokens list
    """
    doc = nlp(sentence)
    tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]
    return tokens

def token_encoder(token, vec):
    if token == "<pad>":
        return 1
    else:
        try:
            return vec.stoi[token]
        except:
            return 0


def encoder(tokens, vec):
    return [token_encoder(token, vec) for token in tokens]  


def padding(list_of_indexes, max_seq_len, padding_index=1):
    output = list_of_indexes + (max_seq_len - len(list_of_indexes))*[padding_index]
    return output[:max_seq_len]      

In [49]:
class TrainData(Dataset):
    def __init__(self, df, max_seq_len=32): # df is the input df, max_seq_len is the max lenght allowed to a sentence before cutting or padding
        self.max_seq_len = max_seq_len
        
        counter = Counter()
        train_iter = iter(df.text.values)
        self.vec = FastText("simple")
        self.vec.vectors[1] = -torch.ones(self.vec.vectors[1].shape[0]) # replacing the vector associated with 1 (padded value) to become a vector of -1.
        self.vec.vectors[0] = torch.zeros(self.vec.vectors[0].shape[0]) # replacing the vector associated with 0 (unknown) to become zeros
        self.vectorizer = lambda x: self.vec.vectors[x]
        self.labels = df.label
        sequences = [padding(encoder(preprocessing(sequence), self.vec), max_seq_len) for sequence in df.text.tolist()]
        self.sequences = sequences
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, i):
        assert len(self.sequences[i]) == self.max_seq_len
        return self.sequences[i], self.labels[i]

In [50]:
#70/30 trian_test_split

def train_test_split(df):
   
    rows = round(df.shape[0] * 0.7) 
    train = df.iloc[:rows]
    test = df.iloc[rows:]
    return train, test
    
train , test = train_test_split(df)



In [51]:
test = test.reset_index(drop=True)
test

Unnamed: 0,text,label
0,Watch it halfway through then realized I had w...,0
1,I noticed all the good tweets seem to happen w...,0
2,@Mel66 They still need to add custom keyword s...,0
3,Needs to find the hole in my air bed.,0
4,I hope my day goes smoothly cause feelin like ...,0
...,...,...
14995,good morning/night everyone,1
14996,@penitch awwww..... thought it'd be a &quot;yu...,1
14997,i miss them,0
14998,I WAS GONNA SING HOT N COLD BY KATY PERRY @ MY...,0


In [52]:
#checking nan values
print(f'Test nan values = {test.label.isnull().sum()}')
print(f'Train nan values = {train.label.isnull().sum()}')

Test nan values = 0
Train nan values = 0


In [53]:
dataset = TrainData(train, max_seq_len=32)
dataset2 = TrainData(test, max_seq_len=32)

In [54]:
def collate(batch, vectorizer=dataset.vectorizer):
    inputs = torch.stack([torch.stack([vectorizer(token) for token in sentence[0]]) for sentence in batch])
    target = torch.LongTensor([item[1] for item in batch]) # Use long tensor to avoid unwanted rounding
    return inputs, target

In [15]:
batch_size = 16
train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate)
test_loader = DataLoader(dataset2, batch_size=batch_size, collate_fn=collate)

In [55]:
next(iter(test_loader))[0].shape

torch.Size([16, 32, 300])

In [56]:
#Costruction of nn

from torch import nn
import torch.nn.functional as F

emb_dim = 300
class Classifier(nn.Module):
    def __init__(self, max_seq_len, emb_dim, hidden1=16, hidden2=16, hidden3=16):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(max_seq_len*emb_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, hidden3)
        self.fc4 = nn.Linear(hidden3, 5)
        self.out = nn.LogSoftmax(dim=1)
    
    
    def forward(self, inputs):
        x = F.relu(self.fc1(inputs.squeeze(1).float()))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return self.out(x)

In [57]:
MAX_SEQ_LEN = 32
model = Classifier(MAX_SEQ_LEN, 300, 16, 16, 16)
model

Classifier(
  (fc1): Linear(in_features=9600, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=16, bias=True)
  (fc4): Linear(in_features=16, out_features=5, bias=True)
  (out): LogSoftmax(dim=1)
)

In [58]:
#Setting up optimizer & criterion

from torch import optim
criterion = nn.CrossEntropyLoss()

optimizer = optim.SGD(model.parameters(), lr=0.001,momentum=0.9)

dataiter = iter(train_loader)
sentences, labels = dataiter.next()

# Forward pass through the network

sentence_idx = 0
sentences.resize_(16, 1, MAX_SEQ_LEN*emb_dim).shape
log_ps = model.forward(sentences[sentence_idx,:])

sentence = sentences[sentence_idx]
#torch.exp(log_ps)

In [59]:
#Training the model

epochs = 500
print_every = 250

for e in range(epochs):
    running_loss = 0
    print(f"Epoch: {e+1}/{epochs}")

    for i, (sentences, labels) in enumerate(iter(train_loader)):

        sentences.resize_(sentences.size()[0], 32* emb_dim)
        optimizer.zero_grad()
        
        output = model.forward(sentences)   # 1) Forward pass
        loss = criterion(output, labels) # 2) Compute loss
        loss.backward()                  # 3) Backward pass
        optimizer.step()                 # 4) Update model
        
        running_loss += loss.item()
        
        if i % print_every == 0:
            print(f"\tIteration: {i}\t Loss: {running_loss/print_every:.4f}")
            running_loss = 0

Epoch: 1/500
	Iteration: 0	 Loss: 0.0063
	Iteration: 250	 Loss: 0.7580
	Iteration: 500	 Loss: 0.7062
	Iteration: 750	 Loss: 0.7074
	Iteration: 1000	 Loss: 0.7013
	Iteration: 1250	 Loss: 0.7036
	Iteration: 1500	 Loss: 0.6984
	Iteration: 1750	 Loss: 0.6999
	Iteration: 2000	 Loss: 0.7020
Epoch: 2/500
	Iteration: 0	 Loss: 0.0028
	Iteration: 250	 Loss: 0.7036
	Iteration: 500	 Loss: 0.6982
	Iteration: 750	 Loss: 0.6993
	Iteration: 1000	 Loss: 0.6978
	Iteration: 1250	 Loss: 0.6985
	Iteration: 1500	 Loss: 0.6963
	Iteration: 1750	 Loss: 0.6973
	Iteration: 2000	 Loss: 0.7006
Epoch: 3/500
	Iteration: 0	 Loss: 0.0028
	Iteration: 250	 Loss: 0.7001
	Iteration: 500	 Loss: 0.6965
	Iteration: 750	 Loss: 0.6979
	Iteration: 1000	 Loss: 0.6968
	Iteration: 1250	 Loss: 0.6971
	Iteration: 1500	 Loss: 0.6957
	Iteration: 1750	 Loss: 0.6964
	Iteration: 2000	 Loss: 0.6993
Epoch: 4/500
	Iteration: 0	 Loss: 0.0028
	Iteration: 250	 Loss: 0.6985
	Iteration: 500	 Loss: 0.6956
	Iteration: 750	 Loss: 0.6972
	Iteration:

In [26]:
#Computing accuracy with ADAM optimizer

acc = 0
for i, (sentences, label) in enumerate(iter(test_loader)):
    sentences.resize_(sentences.size()[0], 32* emb_dim)
    y_probs = model.forward(sentences)
    y_pred = nn.functional.softmax(y_probs,dim = 1)
    for j in range(y_pred.shape[0]):
        if y_pred[j].argmax() == label[j] :
            acc += 1
acc / (len(test_loader) * batch_size)

0.6725079957356077

In [60]:
#Computing accuracy with SGD optimizer & Cross Entropy loss

import numpy as np
acc = 0
for i, (sentences, label) in enumerate(iter(test_loader)):
    sentences.resize_(sentences.size()[0], 32* emb_dim)
    y_probs = model.forward(sentences)
    y_pred = nn.functional.softmax(y_probs,dim = 1)
    for j in range(y_pred.shape[0]):
        if y_pred[j].argmax() == label[j] :
            acc += 1
acc / (len(test_loader) * batch_size)

0.6786380597014925