# File 09: Prediction Sentiment Analysis on Main Dataset

### Input Files:
- model-sa/model-gpu.yaml
- model-sa/model-weights-gpu.h5
- model-sa/tokenizer.pickle
- db/04-main-data.csv
- db/08-user-rating.csv

### Output Files:
- db/09-main-prediction.csv

### Steps:
1. loading required libraries
1. loading model with weights
1. loading the tokenizer
1. loading timeline tweets
1. filter out tweets for which we don't have a user rating
1. combining everything into a dataset
1. creating "X" input array 
1. using the model to predict sentiment of each tweet
1. print accuracy of prediction
1. creating final dataset
1. saving dataset

In [1]:
# loading required libraries
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from tqdm import tqdm
import matplotlib.pyplot as plt
from tensorflow.keras.models import load_model
from keras.models import model_from_yaml
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
# Iterating (rather than indexing [0]) keeps this cell from raising IndexError when
# no GPU is visible, and applies the same setting to every GPU, which TensorFlow
# requires when more than one device is present.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

Using TensorFlow backend.


In [2]:
# loading model with weights
def load_model(model, weight):
    """Rebuild a Keras model from a YAML architecture file and load its weights.

    Args:
        model: Path to the YAML file holding the serialized architecture.
        weight: Path to the weights file handed to ``load_weights``.

    Returns:
        The compiled model with the stored weights loaded.
    """
    # NOTE(review): this function shares its name with the `load_model` imported
    # from tensorflow.keras.models at the top of the notebook and replaces it.
    # NOTE(review): model_from_yaml parses the file with yaml.load (see the warning
    # printed under this cell); only feed it architecture files you trust.
    with open(model, 'r') as yaml_file:
        architecture = yaml_file.read()

    network = tf.keras.models.model_from_yaml(architecture)
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    network.load_weights(weight)
    return network

# Rebuild the network from the saved architecture and weights, then print its layer table.
model = load_model(
    'model-sa/model-gpu.yaml',
    'model-sa/model-weights-gpu.h5',
)
model.summary()

  config = yaml.load(yaml_string)


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 48, 128)           512000    
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 48, 128)           0         
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 48, 196)           255584    
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 48, 196)           0         
_________________________________________________________________
cu_dnnlstm_3 (CuDNNLSTM)     (None, 196)               308896    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 394       
Total params: 1,076,874
Trainable params: 1,076,874
Non-trainable params: 0
____________________________________________

In [69]:
# loading the tokenizer
# Restore the pickled tokenizer object from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('model-sa/tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [70]:
# loading timeline tweets
# `user` supplies the USER and RATING columns read in the next cell;
# `df` holds the tweet rows that are filtered by USER below.
user = pd.read_csv("db/08-user-rating.csv")
df = pd.read_csv("db/04-main-data.csv")

In [71]:
# filter out tweets for which we dont have user rating
user_names = user['USER'].values.tolist()
user_rating = user['RATING'].values.tolist()

# Walk the rated users in order and collect every row of `df` belonging to each one.
# Rows whose USER is not in the rating table are never collected.
array = [
    tweet_row
    for rated_name in tqdm(user_names)
    for tweet_row in df.loc[df['USER'] == rated_name].values.tolist()
]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4912/4912 [00:05<00:00, 835.97it/s]


In [72]:
# combining everything into a dataset
users, tweet, sentiment, rating = [], [], [], []

# Build the USER -> RATING lookup once instead of scanning the whole `user` frame for
# every tweet. The previous int(<Series>) conversion is deprecated in recent pandas and
# raised TypeError whenever a user appeared more than once; here the first rating wins.
rating_by_user = (
    user.drop_duplicates('USER')
    .set_index('USER')['RATING']
    .to_dict()
)

# Each row is positional: the loop reads index 0 as the user, 1 as the tweet text and
# 2 as the sentiment label -- assumes that column order in db/04-main-data.csv (TODO confirm).
for row in tqdm(array) :
    users.append(row[0])
    tweet.append(row[1])
    sentiment.append(row[2])
    rating.append(int(rating_by_user[row[0]]))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5863/5863 [00:02<00:00, 1986.31it/s]


In [73]:
# Zip the four parallel lists into one record per tweet, then discard any record
# that has a missing value in any column.
column_names = ['USER', 'TWEET', 'SENTIMENT', 'RATING']
records = list(zip(users, tweet, sentiment, rating))
df = pd.DataFrame(records, columns=column_names).dropna()

In [74]:
# creating "X" array 
# Convert each tweet to a sequence of token ids, then pad/truncate to length 48,
# the sequence length shown in the model summary above.
token_sequences = tokenizer.texts_to_sequences(df['TWEET'].values)
X = pad_sequences(token_sequences, maxlen=48)

In [75]:
# using the model to predict sentiment of each tweet
# Run the network once; `output` has one row per tweet and one column per class.
output = model.predict(X)
# Derive the class from the same output instead of a second forward pass.
# `predict_classes` is deprecated/removed in newer TensorFlow releases, and for a
# multi-unit output layer it returned exactly this argmax over the last axis.
prediction = np.argmax(output, axis=-1)
# Score the model assigns to class index 1 (presumably a softmax probability -- TODO confirm).
truth = [row[1] for row in output]

In [78]:
# print accuracy of prediction
from sklearn.metrics import confusion_matrix

# Confusion matrix of the labelled sentiment against the predicted class;
# the two diagonal cells are the correctly classified tweets.
cm = confusion_matrix(df['SENTIMENT'].values, prediction)
correct = cm[0, 0] + cm[1, 1]
accuracy = round(correct / len(df), 3)
print(accuracy)

0.735


In [79]:
# creating final dataset
# `prediction` and `truth` were computed from `df` AFTER dropna(), so they line up
# row-for-row with `df`, not with the original users/tweet/sentiment/rating lists.
# Zipping them with those longer lists shifted predictions onto the wrong tweets
# (and silently truncated the tail) whenever dropna() removed a row.
# Attaching the columns to the filtered frame keeps every prediction on its own tweet;
# a length mismatch now raises instead of being hidden.
df = (
    df[['USER', 'TWEET', 'SENTIMENT', 'RATING']]
    .reset_index(drop=True)
    .assign(PREDICTION=prediction, OUTPUT=truth)
)

In [80]:
# saving the final dataset (row index is not written to the file)
df.to_csv(path_or_buf="db/09-main-prediction.csv", index=False)