In [1]:
import pandas as pd
import numpy as np

In [2]:
#loading the data
df=pd.read_csv('facebook_comments.csv', header=None, names=['text','sentiment'] )
#display the first five recors
#df.head()
df['sentiment']=df.sentiment.str.strip()
#create lables
df['labels'] = df.sentiment.map({'neutral':0,'negative':1,'positive':2})
df.head()
#convert text and lables into ndarrays using numpy 
training_text= df.text.values
labels=df.labels.values
print(type(training_text),type(labels))
#training_text


<class 'numpy.ndarray'> <class 'numpy.ndarray'>


### Preprocess data


In [3]:
# preprocess the loaded textual data, including removing stopwords, stemming, and tokenization
# represent each document (i.e., comment) using TF-IDF strategy. The features are the top frequent unigrams across all comments
# possible packages you might need are: scikit-learn, numpy
from sklearn.feature_extraction.text import TfidfVectorizer
# tokenize and create a document-feature matrix X and a label vector Y
vectorizer=TfidfVectorizer(stop_words='english', max_features=500, ngram_range=(1,1)) #unigram
#using top frequent 500 unigram as features
#apply this object the data to be transformed
instances= vectorizer.fit_transform(training_text) #not the array its 1999x500 sparse matrix of type
#convert to array
X= instances.toarray()
Y=labels
# print out the shape of X and Y
print(X.shape,',',Y.shape)
#X
#Y

(1999, 500) , (1999,)


### Traditional Machine Learning Models: Random Forest


In [4]:
# using 10-fold cross-validation to show the prediction accuracy
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
# possible packages you might need are: scikit-learn, numpy
k_fold= KFold(n_splits=10,shuffle=True, random_state=2020)
rf_model= RandomForestClassifier(criterion='entropy',max_depth=2, random_state=2020) #empty model
# there are 10 folds record accuracy for each
rf_cvscores=[]
for train_idx, test_idx in k_fold.split(X): # returns indexes of the records, once we have the index we can get the entire row vector
    rf_model.fit(X[train_idx],Y[train_idx]) #train model on data, when you train you superwise provide X and Y
    acc= rf_model.score(X[test_idx],Y[test_idx])                  # get the accuracy on the validatio set
    rf_cvscores.append(acc)                                         #for every iteration appaned the list
    
print("Random Forest - mean: %.4f%% (std: +/- %.4f%%)" % (np.mean(rf_cvscores)*100,np.std(rf_cvscores)*100))



Random Forest - mean: 64.1832% (std: +/- 2.0458%)


### Fully connected feedforward Neural Network


In [5]:
%conda install -c pytorch pytorch torchvision

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [6]:
# Design your own network with the following requirements:
# 1. Having dropout
# 2. Separate the dataset into training and validation (80-20%)
# 3. The prediction accuracy on the validation set should be at least 50% for this 3-
# possible packages you might need are: scikit-learn, numpy, torch
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

### Build the train loader and validation loader

In [29]:
# convert your numpy array to TensorDataset and create a data loader for training and
# some hyperparameters: input dimension, output dimension, batch size, number of epoc
batch_size = 16 #for each epoch 16 batches to load a batch we need to create a data loader convert arrays into tensors
epochs = 50
lr = 1e-4
indim = X.shape[1] #input dimensionality 
outdim = 3  #output dimensionality 3, {0,1,2}, 3 class classification problem
drate = 0.7 #droprate
#X.shape[0]
#step1: Create X.tensor & Y.tensor from numpy array 
X_tensor= torch.from_numpy(X)
Y_tensor= torch.from_numpy(Y)
#Step2: Creat a dataset, Vertiaclly concatinate them together(when loader is called this dataset will be included in each batch)
dataset= TensorDataset(X_tensor,Y_tensor)
#Step3: Seperate the dataset into train and test set
train_size= int(0.8*len(dataset))
train_size
#test_size=int(0.2*len(dataset))
#test_size
test_size=len(dataset)-train_size
#test_size
train_dataset, test_dataset= torch.utils.data.random_split(dataset,[train_size,test_size]) #using random split create training and testing dataset
#len(train_dataset)
#Step4: Create train loader and test loader 
train_loader= DataLoader(train_dataset,batch_size= batch_size, shuffle=True)
test_loader= DataLoader(test_dataset,batch_size= batch_size, shuffle=True)

### Build the network


In [30]:
# create your model/network
#from class many objects can be instanciated,SentimentNetwork is a subclass inherited from as super/root class in pytorch from NN which is nn.Module
class SentimentNetwork(nn.Module): 
    def __init__(self, input_dim, output_dim, drate):
        super(SentimentNetwork,self).__init__()
        #using the self. variable becomes a class variable as the property of the class if not its a local variable
        #suppose hidden layer 1 has 500 neurons and layer 2 has 50
        self.fc1= nn.Linear(input_dim, 100)
        self.do1=nn.Dropout(drate)
        self.fc2= nn.Linear(100, 50)
        self.do2=nn.Dropout(drate)
        self.opt= nn.Linear(50, output_dim)
         
        
    #forward function is to get the predicated outout, calculate the loss
    # x could be an instance, batch, many instances
    #pass x through hidden layer, call the hidden layers defined above
    def forward(self,x):
        #x with information aggregation and transformation
        x= F.relu(self.fc1(x))
        x= self.do1(x)
        x= F.relu(self.fc2(x))
        x= self.do2(x)
        x= F.log_softmax(self.opt(x))
        x=torch.exp(x)
        return x

In [31]:
# create a model
model = SentimentNetwork(indim,outdim,drate)
print(model)

SentimentNetwork(
  (fc1): Linear(in_features=500, out_features=100, bias=True)
  (do1): Dropout(p=0.7, inplace=False)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (do2): Dropout(p=0.7, inplace=False)
  (opt): Linear(in_features=50, out_features=3, bias=True)
)


### Create a training function to train the model and an evaluation function to evaluate the performance on the separate validation set


In [32]:
#define the loss function and optimizer
#call crossentropyloss and adam    
criterion= nn.CrossEntropyLoss() 
#prediction: batch_size X output_dim, truth: batch_size X 1
optimizer=torch.optim.Adam(model.parameters(),lr=lr)
                                                                 

In [33]:
# define a training process function for one epoch each
def train(model, train_loader, optimizer, criterion):
    epoch_loss, epoch_acc = 0.0,0.0 # the loss and accuracy for each epoch
    acc=0
    model.train()
    for batch_x, batch_y in train_loader:
        #zero gradient
        optimizer.zero_grad()
        #prediction= calculate the predicted output for the current batch batch_x
        net_out= model(batch_x.float()) #models predicted output 
        #convert torch variable to numpy: predictions.detach().numpy()
        #net_out.detach.numpy()
        #loss= calculate the loss for the current batch using predictions and batch_y
        loss= criterion(net_out, batch_y)
        #to get the value from tensor to compare
        loss_val= loss.item()
        #acc= calculate the accuracy using predictions(dimensionality is batch_size x output_dim) and batch_y(dimensionality= batch_sizex1)
        predicted= net_out.argmax(1)
        acc=predicted.eq(batch_y).sum().item ()
        # backpropogate   #iteration can be applied to tensor 
        loss.backward()
        optimizer.step()
        
        
        
        epoch_loss += loss.item()  #to get the value from tensor
        epoch_acc += acc
       
        #calculate the average epoch_loss and epoch_acc 
    average_epoch_loss = epoch_loss/len(train_dataset)
    average_epoch_acc = epoch_acc/len(train_dataset)
    return   average_epoch_loss, average_epoch_acc


# define a validation/evaluation process function
def evaluate(model, test_loader, criterion):
    epoch_loss, epoch_acc = 0.0,0.0 # the loss and accuracy for each epoch
    acc=0
    model.eval()
    #no need for any parameter updating, done for this epoch only evaluate on the parameters learned upto this point no gradient
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            #predict the output
            net_out= model(batch_x.float())
            #calculate the loss
            loss= criterion(net_out, batch_y)
            loss_val= loss.item()
            #calculate the accuracy
            predicted= net_out.argmax(1)
            acc=predicted.eq(batch_y).sum().item ()
            epoch_loss += loss.item()
            epoch_acc += acc
            #calculate the average epoch_loss and epoch_acc
    average_epoch_loss = epoch_loss/len(test_dataset)
    average_epoch_acc = epoch_acc/len(test_dataset)
    return average_epoch_loss, average_epoch_acc
  

### Main starting point: train the model and evaluate the model


In [34]:

# real training and evaluation process
for epoch in range(epochs):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, test_loader, criterion)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}')
    print(f'\t Val. Loss: {valid_loss:.4f} |  Val. Acc: {valid_acc:.4f}')



Epoch: 01
	Train Loss: 0.0686 | Train Acc: 0.3846
	 Val. Loss: 0.0684 |  Val. Acc: 0.6650
Epoch: 02
	Train Loss: 0.0682 | Train Acc: 0.5422
	 Val. Loss: 0.0678 |  Val. Acc: 0.6650
Epoch: 03
	Train Loss: 0.0676 | Train Acc: 0.6179
	 Val. Loss: 0.0670 |  Val. Acc: 0.6650
Epoch: 04
	Train Loss: 0.0667 | Train Acc: 0.6341
	 Val. Loss: 0.0657 |  Val. Acc: 0.6650
Epoch: 05
	Train Loss: 0.0650 | Train Acc: 0.6348
	 Val. Loss: 0.0633 |  Val. Acc: 0.6650
Epoch: 06
	Train Loss: 0.0627 | Train Acc: 0.6354
	 Val. Loss: 0.0603 |  Val. Acc: 0.6650
Epoch: 07
	Train Loss: 0.0604 | Train Acc: 0.6354
	 Val. Loss: 0.0577 |  Val. Acc: 0.6650
Epoch: 08
	Train Loss: 0.0590 | Train Acc: 0.6354
	 Val. Loss: 0.0562 |  Val. Acc: 0.6650
Epoch: 09
	Train Loss: 0.0576 | Train Acc: 0.6354
	 Val. Loss: 0.0555 |  Val. Acc: 0.6650
Epoch: 10
	Train Loss: 0.0572 | Train Acc: 0.6354
	 Val. Loss: 0.0550 |  Val. Acc: 0.6650
Epoch: 11
	Train Loss: 0.0571 | Train Acc: 0.6354
	 Val. Loss: 0.0548 |  Val. Acc: 0.6650
Epoch: 12


### as the number of epochs increased the accuracy of the model improved with 64% for 5 to ~80% for 50