<a href="https://colab.research.google.com/github/wanyun-yang/Neural-Network-1/blob/main/Wanyun_Yang_lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Data

In [214]:
# import the basic packages
import pandas as pd
import numpy as np
# load text data and convert the label/sentiment into corresponding numeric values: 
fname = 'facebook_comments.csv'
df_train = pd.read_csv(fname, header = None, names = ['text', 'sentiment'], encoding = 'iso-8859-1', lineterminator='\n')
sent = {'positive':2, 'neutral':1,'negative':0}
df_train['labels'] = df_train['sentiment'].str.strip().map(sent)

# get texts and labels
training_texts = df_train.text.values
labels = df_train.labels.values

print(type(training_texts),type(labels))

# show the first 5 records
df_train.head()

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


Unnamed: 0,text,sentiment,labels
0,Heres a single to add to Kindle. Just read t...,neutral,1
1,If you tire of Non-Fiction.. Check out http://...,neutral,1
2,Ghost of Round Island is supposedly nonfiction.,neutral,1
3,Why is Barnes and Nobles version of the Kindle...,negative,0
4,@Maria: Do you mean the Nook? Be careful bo...,positive,2


# Preprocess Data

In [215]:
# preprocess the loaded textual data, including removing stopwords, stemming, and tokenization
# represent each document (i.e., comment) using TF-IDF strategy
from sklearn.feature_extraction.text import TfidfVectorizer

# tokenize and create a document-feature matrix X and a label vector Y
vectorizer = TfidfVectorizer(stop_words='english',max_features=500,ngram_range=(1,1))
instances = vectorizer.fit_transform(training_texts)
X = instances.toarray()
Y = labels

# print out the shape of X and Y
print(X.shape,',',Y.shape)

# print out part of X and Y
print(Y[:10])
print(X[0,:50])

(1999, 500) , (1999,)
[1 1 1 0 2 2 2 0 2 0]
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.28915636 0.         0.         0.
 0.         0.         0.2971592  0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.        ]


# Traditional Machine Learning Models: Random Forest

In [216]:
# using 10-fold cross-validation to show the prediction accuracy
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

kfold = KFold(n_splits=10,shuffle=True,random_state=2020)
rf_model = RandomForestClassifier(criterion='entropy',max_depth=2,random_state=2020)
rf_cvscores = []

for train_idx,val_idx in kfold.split(X):
  rf_model.fit(X[val_idx],Y[val_idx])
  acc = rf_model.score(X[val_idx],Y[val_idx])
  rf_cvscores.append(acc)

print("Random Forest - mean: %.4f%% (std: +/- %.4f%%)" % (np.mean(rf_cvscores)*100, np.std(rf_cvscores)*100))

Random Forest - mean: 64.1332% (std: +/- 2.0919%)



# Fully connected feedforward Neural Network

In [234]:
# load packages
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

Build the train loader and validation loader

In [235]:
# Parameters that can make the model better 
batch_size = 15
# when the epochs is 25 or above, the highest valiadation accuracy is around 94-95%, so no need to increase computational waste
epochs = 25
lr = 0.04
indim = X.shape[1]
outdim = 3
drate = 0.7

# Because of the drop out rate, the training data basically cannot be fixed, but I still added the seed to make it relatively more controlable.
torch.manual_seed(0)

# convert X and Y to matrixs
X_tensor = torch.from_numpy(X)
Y_tensor = torch.from_numpy(Y)

dataset = TensorDataset(X_tensor, Y_tensor)

# set 80% data to training set, the rest validation set
train_size = int(0.8*len(dataset))
val_size = len(dataset) - train_size
train_dataset,val_dataset = torch.utils.data.random_split(dataset,[train_size,val_size])

train_loader = DataLoader(train_dataset, shuffle=True,batch_size = batch_size)
val_loader = DataLoader(val_dataset, shuffle=True,batch_size=batch_size)

Build the network


In [236]:
# network model itself
class SentimentNetwork(nn.Module):
  def __init__(self, input_dim, output_dim, dropout_rate):

    super(SentimentNetwork,self).__init__()

    self.fc1 = nn.Linear(500,100)
    self.fc2 = nn.Linear(100,50)
    self.fc3 = nn.Linear(50,3)
    self.drop1 = nn.Dropout(drate)

  def forward(self,x):
    x = F.sigmoid(self.fc1(x))
    x = self.drop1(x)
    x = F.sigmoid(self.fc2(x))
    x = self.fc3(x)
    return F.log_softmax(x)

# Run the model and print the structure 
model = SentimentNetwork(indim,outdim,drate)
print(model)

SentimentNetwork(
  (fc1): Linear(in_features=500, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=3, bias=True)
  (drop1): Dropout(p=0.7, inplace=False)
)


Create a training function to train the model and an evaluation function to evaluate the
performance on the separate validation set

In [237]:
# define a training process function
def train(model, train_loader, optimizer, criterion):
  epoch_loss,epoch_acc = 0.0,0.0 
  model.train()
  for batch_x, batch_y in train_loader:
    #zero gradient
    optimizer.zero_grad()
    #prediction
    output = model(batch_x.float())
    #loss
    loss = criterion(output,batch_y)
    loss.backward()
    optimizer.step()
    #acc
    pred = output.data.max(1)[1]
    acc = pred.eq(batch_y.data).sum()

    epoch_loss += loss.item()
    epoch_acc += acc

  #calculate the average epoch_loss and epoch_acc
  epoch_loss/=len(train_loader.dataset)
  epoch_acc/=len(train_loader.dataset)   
  return epoch_loss, epoch_acc


# define a validation/evaluation process function

def evaluate(model, val_loader, criterion):
   
  epoch_loss,correct = 0.0,0.0

  model.eval()

  with torch.no_grad():
    for batch_x,batch_y in val_loader:
      #prediction
      output = model(batch_x.float())
      #loss
      epoch_loss += criterion(output,batch_y).data
      #acc
      pred = output.data.max(1)[1]
      correct += pred.eq(batch_y.data).sum()

  #calculate the average epoch_loss and epoch_acc
  epoch_loss/=len(val_loader.dataset)
  epoch_acc =correct/len(val_loader.dataset)
  return epoch_loss, epoch_acc


Main starting point: train the model and evaluate the model

In [238]:
# define the loss function and optimizer
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# real training and evaluation process
for epoch in range(epochs):
  train_loss, train_acc = train(model, train_loader, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, val_loader, criterion)
  print(f'Epoch: {epoch+1:02}')
  print(f'\tTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}')
  print(f'\t Val. Loss: {valid_loss:.4f} | Val. Acc: {valid_acc:.4f}')



Epoch: 01
	Train Loss: 0.0548 | Train Acc: 0.6423
	 Val. Loss: 0.0395 | Val. Acc: 0.7700
Epoch: 02
	Train Loss: 0.0381 | Train Acc: 0.7817
	 Val. Loss: 0.0336 | Val. Acc: 0.8175
Epoch: 03
	Train Loss: 0.0311 | Train Acc: 0.8286
	 Val. Loss: 0.0331 | Val. Acc: 0.7925
Epoch: 04
	Train Loss: 0.0282 | Train Acc: 0.8424
	 Val. Loss: 0.0298 | Val. Acc: 0.8150
Epoch: 05
	Train Loss: 0.0255 | Train Acc: 0.8587
	 Val. Loss: 0.0300 | Val. Acc: 0.8425
Epoch: 06
	Train Loss: 0.0225 | Train Acc: 0.8755
	 Val. Loss: 0.0269 | Val. Acc: 0.8700
Epoch: 07
	Train Loss: 0.0204 | Train Acc: 0.8862
	 Val. Loss: 0.0257 | Val. Acc: 0.8725
Epoch: 08
	Train Loss: 0.0189 | Train Acc: 0.8949
	 Val. Loss: 0.0258 | Val. Acc: 0.8825
Epoch: 09
	Train Loss: 0.0150 | Train Acc: 0.9256
	 Val. Loss: 0.0241 | Val. Acc: 0.8850
Epoch: 10
	Train Loss: 0.0147 | Train Acc: 0.9287
	 Val. Loss: 0.0248 | Val. Acc: 0.9075
Epoch: 11
	Train Loss: 0.0147 | Train Acc: 0.9337
	 Val. Loss: 0.0241 | Val. Acc: 0.9025
Epoch: 12
	Train Loss