In [60]:
import pandas as pd
import numpy as np

import torch
from torch.utils import data

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

sentiment_mapping ={
    'Positive': 1,
    'Negative': 0,
    'Neutral': 2,
    'neutral': 2
}

sentiment_mapping2 ={
    'pos': 1,
    'neg': 0,
    'neu': 2
}

sentiment_mapping_reverse ={
    1: 'Positive',
    0: 'Negative',
    2: 'Neutral'
}

In [4]:
from gensim.models import FastText as ft
word_vectors=ft.load_fasttext_format("cc.ms.300")

In [482]:

df = pd.read_excel('./data/vala_processed.xlsx')
print(df.shape)
df = df.drop_duplicates(['CONTENT'])
df = df.reset_index(drop=True)
print(df.shape)

(60115, 15)
(13260, 15)


In [483]:
df.sentiment.unique()

array(['Negative', 'Positive', 'Neutral', 'neutral'], dtype=object)

In [484]:
df['sentiment_label'] = df['sentiment'].map(lambda x: sentiment_mapping[x])
df.head()

Unnamed: 0,CONTENT,#hashtags,#urls,#mentions,#word,#capital,#pos_emojis,#neg_emojis,#emojis,#exclaimation_question,sentiment,BN,PH,PAS,General,sentiment_label
0,""" # elections news : did najib just pocket the...",2,0,0,21,0,0,0,0,1,Negative,1,0,0,0,0
1,rt rt untuk tun mahathir . love untuk najib ra...,1,0,0,12,3,0,0,0,0,Positive,1,0,0,0,1
2,rt who is arguably the best malaysia ' s prime...,0,0,0,23,2,0,0,1,1,Positive,1,0,0,0,1
3,""" a year of dread and foreboding in the malays...",0,0,0,27,1,0,0,0,0,Negative,1,0,0,0,0
4,""" mr najib may be venal , but he is not stupid...",0,0,0,35,0,0,0,0,0,Negative,1,0,0,0,0


In [485]:
error_list = []
error_index = []
for i, row in df.iterrows():
#     print(i)
    try:
        content_split = row['CONTENT'].split()
        for word in content_split:
            word_vectors.wv[word]
    except:
        error_list.append(word)
        error_index.append(i)
        
        


In [486]:
set(error_list)

{'!´',
 '!”',
 '!…',
 '"—',
 '"‘',
 '"“',
 '"…',
 "'…",
 '(…',
 ')…',
 ',”',
 ',…',
 '-‘',
 '-…',
 '.’',
 '.”',
 '.…',
 '/…',
 '0k',
 '3j',
 '5o',
 ':…',
 ';…',
 '?’',
 '?”',
 '?…',
 '`',
 'election',
 'q4',
 '~°',
 'үр',
 'עד',
 'أ',
 'َ',
 'ُ',
 'அட',
 'அன',
 'எட',
 'எவ',
 'ஓல',
 'ண',
 'ம',
 'மல',
 'ி',
 'ீ',
 'ு',
 'ூ',
 'ே',
 'ொ',
 'ั',
 'ี้',
 'ึ',
 'ุ',
 'ົ',
 'မ',
 '—…',
 '‘"',
 '‘“',
 '’,',
 '’.',
 '’:',
 '’…',
 '“#',
 '”)',
 '”,',
 '”.',
 '”:',
 '”?',
 '”…',
 '”？',
 '€“',
 '⋆',
 '⋯',
 '⒈',
 '◤“',
 '☞',
 '⚘',
 '。#',
 '。…',
 '一九',
 '即時',
 '招数',
 '蓝眼',
 '호야',
 '화',
 '️.',
 '️@',
 '）!',
 '）！',
 '）；',
 '，#',
 '🏼',
 '🏽',
 '🏾'}

In [487]:
df = df.drop(error_index)

In [488]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

In [489]:
train_df.shape, val_df.shape

((11573, 16), (1286, 16))

In [490]:
class TwitterDataset(data.Dataset):
    def __init__(self, df, scaler, training=True):
        self.data = df
        self.scaler = scaler
        self.tweet = self.data[['CONTENT']]
        self.training = training
        self.metadata = self.data[['#hashtags','#urls','#mentions','#word','#capital','#pos_emojis','#neg_emojis','#emojis','#exclaimation_question']]
        
        if training:
            self.party_label = self.data[['PH','BN','PAS','General']]
            self.sentiment_label = self.data[['sentiment_label']]
            self.metadata = scaler.fit_transform(self.metadata)
        else:
            self.metadata = scaler.transform(self.metadata)
        

    def __len__(self):
        return len(self.data)
    
    def prepareVector(self, sentence):
        sentence = sentence.split()
        featureVec = np.zeros(300)
        nwords = 0

        for word in sentence:
            nwords = nwords + 1
            featureVec = np.add(featureVec,word_vectors.wv[word])

        featureVec = np.divide(featureVec, nwords)
        return featureVec

    def __getitem__(self, index):
#         sentence = self.data.iloc[index, 1]
#         sentiment_label = self.data.iloc[index, 2]
#         party_label = self.data.iloc[index, 'PH','BN','PAS','General']
#         print(self.tweet.iloc[index])
        wordFeature = self.prepareVector(self.tweet.iloc[index].values[0])
        metadataFeature = self.metadata[index]
        
        feature = np.concatenate([wordFeature, metadataFeature])
        
        if self.training:
            sentLabel = self.sentiment_label.iloc[index].values
            partyLabel = self.party_label.iloc[index].values
            
            return feature, sentLabel[0], partyLabel
        else:
            return feature
        

In [498]:
scaler = StandardScaler()
train_dataset = TwitterDataset(df=train_df, scaler=scaler, training=True)
train_loader = data.DataLoader(train_dataset, batch_size=32, num_workers=4)

val_dataset = TwitterDataset(df=val_df, scaler= scaler, training=True)
val_loader = data.DataLoader(val_dataset, batch_size=32, num_workers=4)

In [499]:
feature, sentLabel, partyLabel = next(iter(train_loader))
print(feature.size(), sentLabel.size(), partyLabel.size())
print(feature.dtype, sentLabel.dtype, partyLabel.dtype)

torch.Size([32, 309]) torch.Size([32]) torch.Size([32, 4])
torch.float64 torch.int64 torch.int64


In [500]:
from torch import nn
import torch.nn.functional as F

class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(309, 1000) 
        self.fc2 = nn.Linear(1000, 500) 
        self.fc3 = nn.Linear(500, 100)
        self.sentiment_layer = nn.Linear(100, 3)
        self.party_layer = nn.Linear(100, 4)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        sentiment_output = self.sentiment_layer(x)
        party_output = torch.sigmoid(self.party_layer(x))
        return sentiment_output, party_output

In [501]:
net = NeuralNet()

In [502]:
feature.shape

torch.Size([32, 309])

In [503]:
# net.train()
# net(feature.float())

In [504]:
sent_criterion = nn.CrossEntropyLoss()
party_criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)  

data_loaders = {'train': train_loader, 'val': val_loader}

In [505]:
num_epochs=50
total_step = len(train_loader)
for epoch in range(num_epochs):
    for phase in ['train', 'val']:
        if phase == 'train':
            for i, (features, sentLabel, partyLabel) in enumerate(data_loaders[phase]):  
                # Move tensors to the configured device
                features = features.float()
                partyLabel = partyLabel.type(torch.FloatTensor)

                # Forward pass
                output_sent, output_party = net(features)

                sent_loss = sent_criterion(output_sent, sentLabel)
                party_loss = party_criterion(output_party, partyLabel)
#                 print((output_party.round() == partyLabel).type(torch.FloatTensor).sum()/(32*4))
                loss = sent_loss + party_loss

                # Backward and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if (i+1) % 100 == 0:
                    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                           .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
        else:
            with torch.no_grad():
                correct = 0
                total = 0
                party_accuracy = 0
                for i, (features, sentLabel, partyLabel) in enumerate(data_loaders[phase]):  
                    features = features.float()
                    partyLabel = partyLabel.type(torch.FloatTensor)
                    
                    output_sent, output_party = net(features)
                    
                    # calculate sentiment accuracy
                    _, predicted = torch.max(output_sent.data, 1)
                    total += sentLabel.size(0)
                    correct += (predicted == sentLabel).sum().item()
                    
                    # calculate party accuracy
                    party_accuracy += (output_party.round() == partyLabel).type(torch.FloatTensor).sum()/(32*4)
                    
                print('Validation Sentiment Accuracy: {} %'.format(100 * correct / total))
                print('Validation Party Accuracy: {} %'.format(100 * (party_accuracy/len(data_loaders[phase]))))
                

Epoch [1/50], Step [100/362], Loss: 1.0836
Epoch [1/50], Step [200/362], Loss: 1.4084
Epoch [1/50], Step [300/362], Loss: 1.2143
Validation Sentiment Accuracy: 52.33281493001555 %
Validation Party Accuracy: 85.23247528076172 %
Epoch [2/50], Step [100/362], Loss: 1.0039
Epoch [2/50], Step [200/362], Loss: 1.2811
Epoch [2/50], Step [300/362], Loss: 1.1304
Validation Sentiment Accuracy: 56.37636080870917 %
Validation Party Accuracy: 86.58536529541016 %
Epoch [3/50], Step [100/362], Loss: 0.9560
Epoch [3/50], Step [200/362], Loss: 1.1824
Epoch [3/50], Step [300/362], Loss: 1.0529
Validation Sentiment Accuracy: 58.39813374805599 %
Validation Party Accuracy: 87.32850646972656 %
Epoch [4/50], Step [100/362], Loss: 0.8987
Epoch [4/50], Step [200/362], Loss: 1.1039
Epoch [4/50], Step [300/362], Loss: 1.0118
Validation Sentiment Accuracy: 59.33125972006221 %
Validation Party Accuracy: 87.78582000732422 %
Epoch [5/50], Step [100/362], Loss: 0.8487
Epoch [5/50], Step [200/362], Loss: 1.0683
Epoch 

Epoch [37/50], Step [100/362], Loss: 0.1834
Epoch [37/50], Step [200/362], Loss: 0.2182
Epoch [37/50], Step [300/362], Loss: 0.1490
Validation Sentiment Accuracy: 63.99688958009331 %
Validation Party Accuracy: 88.64329528808594 %
Epoch [38/50], Step [100/362], Loss: 0.0768
Epoch [38/50], Step [200/362], Loss: 0.1250
Epoch [38/50], Step [300/362], Loss: 0.0531
Validation Sentiment Accuracy: 66.40746500777605 %
Validation Party Accuracy: 88.52896118164062 %
Epoch [39/50], Step [100/362], Loss: 0.0901
Epoch [39/50], Step [200/362], Loss: 0.1359
Epoch [39/50], Step [300/362], Loss: 0.1380
Validation Sentiment Accuracy: 63.919129082426124 %
Validation Party Accuracy: 89.11966705322266 %
Epoch [40/50], Step [100/362], Loss: 0.0746
Epoch [40/50], Step [200/362], Loss: 0.2301
Epoch [40/50], Step [300/362], Loss: 0.0466
Validation Sentiment Accuracy: 65.0855365474339 %
Validation Party Accuracy: 88.94817352294922 %
Epoch [41/50], Step [100/362], Loss: 0.0812
Epoch [41/50], Step [200/362], Loss:

In [48]:
torch

In [334]:
net.state_dict()


OrderedDict([('fc1.weight',
              tensor([[ 0.1771,  0.3051,  0.2256,  ..., -0.0169, -0.0924,  0.0713],
                      [-0.0884, -0.3738,  0.2825,  ...,  0.0090, -0.0578,  0.2121],
                      [ 0.6976, -0.1008, -0.3517,  ..., -0.0367, -0.0256,  0.0636],
                      ...,
                      [ 0.0649,  0.1610, -0.9014,  ...,  0.1054,  0.1381,  0.1164],
                      [ 0.0152, -0.7118,  0.0167,  ...,  0.0539,  0.0582,  0.1090],
                      [ 0.1407,  0.0260,  0.0666,  ...,  0.0172, -0.1435, -0.3649]])),
             ('fc1.bias',
              tensor([-0.1245, -0.5631, -0.3903, -0.2231, -0.1393, -0.4189, -0.0581, -0.2770,
                      -0.3721, -0.2303, -0.0339, -0.1062, -0.3938, -0.0569, -0.4054, -0.3984,
                      -0.3965, -0.8722, -0.1925, -0.1308, -0.4791, -0.2916, -0.2130, -0.0547,
                      -0.3590, -0.2687, -0.2527, -0.1211, -0.1169, -0.2237, -0.2332, -0.1648,
                      -0.2108, -0.28

In [335]:
# torch.save(net.state_dict(), 'twitter_sentiment_model.pth')

# Trained on full dataset

In [473]:
df2 = pd.read_excel('./data/vala_processed.xlsx')
print(df2.shape)
df2 = df2.drop_duplicates(['CONTENT'])
df2 = df2.reset_index(drop=True)
print(df2.shape)

(60115, 15)
(13260, 15)


In [476]:
df2.tail()

Unnamed: 0,CONTENT,#hashtags,#urls,#mentions,#word,#capital,#pos_emojis,#neg_emojis,#emojis,#exclaimation_question,sentiment,BN,PH,PAS,General,sentiment_label
13255,statement agong malaysia disaksikan panglima t...,0,0,0,38,0,0,0,0,0,Negative,1,1,0,0,0
13256,suruhanjaya pilihan raya malaysia kenyataan me...,0,0,0,28,14,0,0,0,0,Negative,0,1,0,0,0
13257,"telah berlangsung sebentar tadi , lelongan jam...",0,0,0,30,2,0,0,0,0,Negative,0,1,0,0,0
13258,muar - ketua pemuda parti pribumi bersatu mala...,0,0,0,26,3,0,0,0,0,Positive,0,1,0,0,1
13259,this pru14 ge14 is the right time to iss the s...,2,0,0,36,4,1,0,1,0,Negative,1,1,0,0,0


In [477]:
# df2 = pd.read_excel('./data/vala_processed.xlsx')
df2['sentiment_label'] = df2['sentiment'].map(lambda x: sentiment_mapping[x])

error_list2 = []
error_index2 = []
for i, row in df2.iterrows():
#     print(i)
    try:
        content_split = row['CONTENT'].split()
        for word in content_split:
            word_vectors.wv[word]
    except:
        error_list2.append(word)
        error_index2.append(i)
        
df2 = df2.drop(error_index2)

In [478]:
net2 = NeuralNet()
sent_criterion = nn.CrossEntropyLoss()
party_criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net2.parameters(), lr=0.001)  

# data_loaders = {'train': train_loader, 'val': val_loader}

In [479]:
scaler = StandardScaler()
train_dataset = TwitterDataset(df=df2, scaler=scaler, training=True)
train_loader = data.DataLoader(train_dataset, batch_size=64, num_workers=4)


In [480]:
# load model
# net2.load_state_dict(torch.load('twitter_sentiment_model_all.pth'))

In [481]:
num_epochs=50
total_step = len(train_loader)
for epoch in range(num_epochs):
    correct = 0
    total = 0
    party_accuracy = 0
    for i, (features, sentLabel, partyLabel) in enumerate(train_loader):  
        # Move tensors to the configured device
        features = features.float()
        partyLabel = partyLabel.type(torch.FloatTensor)

        # Forward pass
        output_sent, output_party = net2(features)

        sent_loss = sent_criterion(output_sent, sentLabel)
        party_loss = party_criterion(output_party, partyLabel)
#                 print((output_party.round() == partyLabel).type(torch.FloatTensor).sum()/(32*4))
        loss = sent_loss + party_loss
    
        # calculate sentiment accuracy
        _, predicted = torch.max(output_sent.data, 1)
        total += sentLabel.size(0)
        correct += (predicted == sentLabel).sum().item()
        
#         print((output_party.round() == partyLabel).type(torch.FloatTensor).sum()/(64*4))
        party_accuracy += (output_party.round() == partyLabel).type(torch.FloatTensor).sum()/(64*4)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
        
    print('Training Sentiment Accuracy: {} %'.format(100 * correct / total))
    print('Training Party Accuracy: {} %'.format(100 * (party_accuracy/len(train_loader))))
        

Epoch [1/50], Step [100/201], Loss: 1.4005
Epoch [1/50], Step [200/201], Loss: 1.3808
Training Sentiment Accuracy: 52.243564818415116 %
Training Party Accuracy: 85.7956314086914 %
Epoch [2/50], Step [100/201], Loss: 1.3597
Epoch [2/50], Step [200/201], Loss: 1.3608
Training Sentiment Accuracy: 54.81763745236799 %
Training Party Accuracy: 86.02494812011719 %
Epoch [3/50], Step [100/201], Loss: 1.3538
Epoch [3/50], Step [200/201], Loss: 1.3967
Training Sentiment Accuracy: 54.78653083443503 %
Training Party Accuracy: 86.2348403930664 %
Epoch [4/50], Step [100/201], Loss: 1.3009
Epoch [4/50], Step [200/201], Loss: 1.3500
Training Sentiment Accuracy: 53.99331207714441 %
Training Party Accuracy: 87.13075256347656 %
Epoch [5/50], Step [100/201], Loss: 1.2628
Epoch [5/50], Step [200/201], Loss: 1.3246
Training Sentiment Accuracy: 53.99331207714441 %
Training Party Accuracy: 87.4475326538086 %
Epoch [6/50], Step [100/201], Loss: 1.2207
Epoch [6/50], Step [200/201], Loss: 1.3406
Training Sentime

KeyboardInterrupt: 

In [357]:
# torch.save(net2.state_dict(), 'twitter_sentiment_model_all.pth')

In [387]:
#test
df_test = pd.read_excel('./data/issues_processed_v2.xlsx')
df_test.columns = ['CONTENT', '#hashtags', '#urls', '#mentions', '#word', '#capital',
       '#pos_emojis', '#neg_emojis', '#emojis', '#exclaimation_question',
       'keyword']

In [388]:
df_test.head()

Unnamed: 0,CONTENT,#hashtags,#urls,#mentions,#word,#capital,#pos_emojis,#neg_emojis,#emojis,#exclaimation_question,keyword
0,perdaftaran bantuan sara hidup ( bsh ) akan di...,3,0,0,18,1,0,0,0,0,Budget2019
1,rt perdaftaran bantuan sara hidup ( bsh ) akan...,2,0,0,18,2,0,0,0,0,Budget2019
2,rt perdaftaran bantuan sara hidup ( bsh ) akan...,2,0,0,18,2,0,0,0,0,Budget2019
3,rt perdaftaran bantuan sara hidup ( bsh ) akan...,2,0,0,18,2,0,0,0,0,Budget2019
4,rt perdaftaran bantuan sara hidup ( bsh ) akan...,2,0,0,18,2,0,0,0,0,Budget2019


In [392]:
error_list_test = []
error_index_test = []
for i, row in df_test.iterrows():
#     print(i)
    try:
        content_split = row['CONTENT'].split()
        for word in content_split:
            word_vectors.wv[word]
    except:
        error_list_test.append(word)
        error_index_test.append(i)

In [442]:
df_test = df_test.drop(error_index_test)
df_test = df_test.reset_index(drop=True)

In [397]:
test_dataset = TwitterDataset(df=df_test, scaler=scaler, training=False)
test_loader = data.DataLoader(test_dataset, batch_size=64, num_workers=4)

In [398]:
next(iter(test_loader))

tensor([[ 0.0143, -0.0067,  0.0039,  ..., -0.0399, -0.1628, -0.3705],
        [ 0.0079, -0.0030,  0.0018,  ..., -0.0399, -0.1628, -0.3705],
        [ 0.0079, -0.0030,  0.0018,  ..., -0.0399, -0.1628, -0.3705],
        ...,
        [ 0.0079, -0.0030,  0.0018,  ..., -0.0399, -0.1628, -0.3705],
        [ 0.0082, -0.0029,  0.0019,  ..., -0.0399, -0.1628, -0.3705],
        [ 0.0143, -0.0067,  0.0039,  ..., -0.0399, -0.1628, -0.3705]],
       dtype=torch.float64)

In [423]:
predicted_sent_list = []
predicted_party_list = []

with torch.no_grad():
    for i, (features) in enumerate(test_loader):  
        features = features.float()
        partyLabel = partyLabel.type(torch.FloatTensor)

        output_sent, output_party = net2(features)

        _, predicted_sent = torch.max(output_sent.data, 1)
        predicted_party = output_party.round()
        
        predicted_party_list+=predicted_party.numpy().tolist()
        predicted_sent_list+=predicted_sent.numpy().tolist()
#         print(predicted_party_list)
        
#         predicted_sent_list.append(predicted_sent)
#         predicted_party_list.append(predicted_party)

In [434]:
df_pred_party = pd.DataFrame(np.vstack(predicted_party_list).astype('int'))
df_pred_party.columns = ['BN','PH','PAS','General']
df_pred_party.head()

Unnamed: 0,BN,PH,PAS,General
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [435]:
df_pred_sent = pd.DataFrame(np.vstack(predicted_sent_list).astype('int'))
df_pred_sent.columns = ['sentiment']
df_pred_sent.head()

Unnamed: 0,sentiment
0,2
1,2
2,2
3,2
4,2


In [455]:
df_test_predicted = pd.concat([df_test, df_pred_sent, df_pred_party], sort=False, axis=1)
df_test_predicted['sentiment'] = df_test_predicted['sentiment'].map(lambda x: sentiment_mapping_reverse[x])

In [456]:
df_test_predicted.head()

Unnamed: 0,CONTENT,#hashtags,#urls,#mentions,#word,#capital,#pos_emojis,#neg_emojis,#emojis,#exclaimation_question,keyword,sentiment,BN,PH,PAS,General
0,perdaftaran bantuan sara hidup ( bsh ) akan di...,3,0,0,18,1,0,0,0,0,Budget2019,Neutral,0,0,0,1
1,rt perdaftaran bantuan sara hidup ( bsh ) akan...,2,0,0,18,2,0,0,0,0,Budget2019,Neutral,0,0,0,1
2,rt perdaftaran bantuan sara hidup ( bsh ) akan...,2,0,0,18,2,0,0,0,0,Budget2019,Neutral,0,0,0,1
3,rt perdaftaran bantuan sara hidup ( bsh ) akan...,2,0,0,18,2,0,0,0,0,Budget2019,Neutral,0,0,0,1
4,rt perdaftaran bantuan sara hidup ( bsh ) akan...,2,0,0,18,2,0,0,0,0,Budget2019,Neutral,0,0,0,1


In [459]:
# df_test_predicted.to_excel("./data/issues_processed_v2_prediction.xlsx") 

# New section

In [17]:
df1 = pd.read_excel('./data/Tweet sentiment/TAGGED_normalised_BR1M_1202.xlsx')
df2 = pd.read_excel('./data/Tweet sentiment/TAGGED_normalised_Budget2019_1202.xlsx')
df3 = pd.read_excel('./data/Tweet sentiment/TAGGED_normalised_ICERD_1202.xlsx')
df4 = pd.read_excel('./data/Tweet sentiment/TAGGED_normalised_PRKCameronHighlands_1202.xlsx')
df5 = pd.read_excel('./data/Tweet sentiment/TAGGED_normalised_PRU14_1202.xlsx')
df6 = pd.read_excel('./data/Tweet sentiment/TAGGED_normalised_SST_1202.xlsx')

In [26]:
df = pd.concat([df1, df2, df3, df4, df5, df6], axis=0)

In [19]:
df_list = [df1, df2, df3, df4, df5, df6]

In [20]:
for i in df_list:
    print(i.columns.tolist())

['full_text', 'text4', 'Sentiment', 'Remark', 'Tagged by', '#hashtags', '#urls', '#mentions', '#word', '#capital', '#pos_emojis', '#neg_emojis', '#emojis', '#exclaimation_question']
['full_text', 'text4', 'Sentiment', 'Remark', 'Tagged by', '#hashtags', '#urls', '#mentions', '#word', '#capital', '#pos_emojis', '#neg_emojis', '#emojis', '#exclaimation_question']
['full_text', 'text4', 'Sentiment', 'Remark', 'Tagged by', '#hashtags', '#urls', '#mentions', '#word', '#capital', '#pos_emojis', '#neg_emojis', '#emojis', '#exclaimation_question']
['full_text', 'text4', 'Sentiment', 'Remark', 'Tagged by', '#hashtags', '#urls', '#mentions', '#word', '#capital', '#pos_emojis', '#neg_emojis', '#emojis', '#exclaimation_question']
['full_text', 'text4', 'Sentiment', 'Remark', 'Tagged by', '#hashtags', '#urls', '#mentions', '#word', '#capital', '#pos_emojis', '#neg_emojis', '#emojis', '#exclaimation_question']
['full_text', 'text4', 'Sentiment', 'Remark', 'Tagged by', '#hashtags', '#urls', '#mention

In [27]:
df.shape

(5407, 14)

In [29]:
df.tail()

Unnamed: 0,full_text,text4,Sentiment,Remark,Tagged by,#hashtags,#urls,#mentions,#word,#capital,#pos_emojis,#neg_emojis,#emojis,#exclaimation_question
2443,Tun M memulakan ucapan di Kuala Kedah. https:/...,tun memulakan ucapan di kuala kedah <url>,,,,0,0,0,9,1,0,0,0,0
2444,Dulu kita miskin. Semasa kita berjuang masa na...,dulu kita miskin semasa kita berjuang masa nak...,,,,0,0,0,27,1,0,0,0,0
2445,Najib sudah mengikat negara China. Kita baik t...,najib sudah mengikat negara china kita baik ta...,,,,0,0,0,27,1,0,0,0,1
2446,RT @SumishaCNA: Opposition needs to form a coa...,opposition needs to form coalition to remove p...,,,,0,0,1,19,3,0,0,0,0
2447,10. Diperingkat akar umbi dan juga segelintir ...,diperingkat akar umbi dan juga segelintir pemi...,,,,0,0,0,24,2,0,0,0,0


In [92]:
df['Sentiment'].unique()
# df['Tagged by'].unique()

array(['neg', 'neu', 'pos', nan, 'positive'], dtype=object)

In [89]:
df = df[~(df['Tagged by']=='akmal')]

In [94]:
df.loc[df['Sentiment']=='new','Sentiment'] = 'neu'
df.loc[df['Sentiment']=='neu/pos','Sentiment'] = 'pos'
df.loc[df['Sentiment']=='neu/positive', 'Sentiment'] = 'pos'
df.loc[df['Sentiment']=='neu/neg', 'Sentiment'] = 'pos' 
df.loc[df['Sentiment']=='positive', 'Sentiment'] = 'pos'

In [95]:
dff = df[~df['Sentiment'].isna()]
dff['sentiment_label'] = dff['Sentiment'].map(lambda x: sentiment_mapping2[x])
dff.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,full_text,text4,Sentiment,Remark,Tagged by,#hashtags,#urls,#mentions,#word,#capital,#pos_emojis,#neg_emojis,#emojis,#exclaimation_question,sentiment_label
0,RT @monamalaysia: @shonzu11 Jangan merendahkan...,jangan merendahkan kebolehan pm memerintah den...,neg,sarcasm,Keng Hooi,0,0,2,54,2,0,0,0,0,0
1,"RT @Nasionalis_: Jika BR1M itu ibarat candu, m...",jika br1m itu ibarat candu maka bsh bantuan sa...,neg,,Keng Hooi,0,0,1,23,2,0,0,0,1,0
2,RT @myintifada: Look at ridiculously IDIOT PH ...,look at ridiculously idiot ph government use o...,neg,,Keng Hooi,0,0,1,64,5,0,0,0,1,0
3,Sokongan masyarakat diperlukan utk membantu pi...,sokongan masyarakat diperlukan untuk membantu ...,neu,,Keng Hooi,0,0,0,18,2,0,0,0,0,2
4,RT @BetterNation3: @Nazgul71028348 @spender_bi...,sekiranya br1m diibaratkan sebagai candu adaka...,neg,,Keng Hooi,0,0,10,72,1,0,0,0,0,0


In [96]:
error_list2 = []
error_index2 = []
for i, row in dff.iterrows():
    try:
        content_split = row['text4'].split()
        for word in content_split:
            word_vectors.wv[word]
    except:
        error_list2.append(word)
        error_index2.append(i)
        
dff2 = dff.drop(error_index2)

In [97]:
class TwitterDataset(data.Dataset):
    def __init__(self, df, scaler, training=True, normalize=True):
        self.data = df
        self.scaler = scaler
        self.tweet = self.data[['text4']]
        self.training = training
        self.metadata = self.data[['#hashtags','#urls','#mentions','#word','#capital','#pos_emojis','#neg_emojis','#emojis','#exclaimation_question']]
        
        if normalize:
            self.metadata = scaler.fit_transform(self.metadata)
        else:
            self.metadata = scaler.transform(self.metadata)
            
        if training:
#             self.party_label = self.data[['PH','BN','PAS','General']]
            self.sentiment_label = self.data[['sentiment_label']]
    

    def __len__(self):
        return len(self.data)
    
    def prepareVector(self, sentence):
        sentence = sentence.split()
        featureVec = np.zeros(300)
        nwords = 0

        for word in sentence:
            nwords = nwords + 1
            featureVec = np.add(featureVec,word_vectors.wv[word])

        featureVec = np.divide(featureVec, nwords)
        return featureVec

    def __getitem__(self, index):
        wordFeature = self.prepareVector(self.tweet.iloc[index].values[0])
        metadataFeature = self.metadata[index]
        
        feature = np.concatenate([wordFeature, metadataFeature])
        
        if self.training:
            sentLabel = self.sentiment_label.iloc[index].values
#             partyLabel = self.party_label.iloc[index].values
            
            return feature, sentLabel[0]
        else:
            return feature
        

In [98]:
from torch import nn
import torch.nn.functional as F

class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(309, 1000) 
        self.fc2 = nn.Linear(1000, 500) 
        self.fc3 = nn.Linear(500, 100)
        self.sentiment_layer = nn.Linear(100, 3)
        self.party_layer = nn.Linear(100, 4)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        sentiment_output = self.sentiment_layer(x)
        party_output = torch.sigmoid(self.party_layer(x))
        return sentiment_output, party_output

In [100]:
train_df, val_df = train_test_split(dff2, test_size=0.1, random_state=42)

In [101]:
scaler = StandardScaler()
train_dataset = TwitterDataset(df=train_df, scaler=scaler, training=True, normalize=True)
train_loader = data.DataLoader(train_dataset, batch_size=32, num_workers=4)

val_dataset = TwitterDataset(df=val_df, scaler= scaler, training=True, normalize=False)
val_loader = data.DataLoader(val_dataset, batch_size=32, num_workers=4)

In [109]:
net2 = NeuralNet()
sent_criterion = nn.CrossEntropyLoss()
party_criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net2.parameters(), lr=0.001)  

data_loaders = {'train': train_loader, 'val': val_loader}# train_dataset[0]

In [111]:
num_epochs=50
total_step = len(train_loader)
for epoch in range(num_epochs):
    for phase in ['train', 'val']:
        if phase == 'train':
            for i, (features, sentLabel) in enumerate(data_loaders[phase]):  
                # Move tensors to the configured device
                features = features.float()

                # Forward pass
                output_sent, output_party = net2(features)

                sent_loss = sent_criterion(output_sent, sentLabel)
                loss = sent_loss 

                # Backward and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if (i+1) % 100 == 0:
                    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                           .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
        else:
            with torch.no_grad():
                correct = 0
                total = 0
                party_accuracy = 0
                for i, (features, sentLabel) in enumerate(data_loaders[phase]):  
                    features = features.float()
                    
                    output_sent, output_party = net2(features)
                    
                    # calculate sentiment accuracy
                    _, predicted = torch.max(output_sent.data, 1)
                    total += sentLabel.size(0)
                    correct += (predicted == sentLabel).sum().item()
                    
                    
                print('Validation Sentiment Accuracy: {} %'.format(100 * correct / total))
                

Validation Sentiment Accuracy: 52.12765957446808 %
Validation Sentiment Accuracy: 55.851063829787236 %
Validation Sentiment Accuracy: 56.91489361702128 %
Validation Sentiment Accuracy: 60.1063829787234 %
Validation Sentiment Accuracy: 60.1063829787234 %
Validation Sentiment Accuracy: 64.8936170212766 %
Validation Sentiment Accuracy: 63.297872340425535 %
Validation Sentiment Accuracy: 66.48936170212765 %
Validation Sentiment Accuracy: 65.95744680851064 %
Validation Sentiment Accuracy: 64.8936170212766 %
Validation Sentiment Accuracy: 61.702127659574465 %
Validation Sentiment Accuracy: 61.170212765957444 %
Validation Sentiment Accuracy: 60.1063829787234 %
Validation Sentiment Accuracy: 60.638297872340424 %
Validation Sentiment Accuracy: 62.234042553191486 %
Validation Sentiment Accuracy: 60.638297872340424 %
Validation Sentiment Accuracy: 59.04255319148936 %
Validation Sentiment Accuracy: 61.702127659574465 %
Validation Sentiment Accuracy: 63.297872340425535 %
Validation Sentiment Accura

In [118]:
# load test set
test_df = pd.read_excel('./data/issues_processed_v2_prediction.xlsx')
test_df.columns = ['text4', '#hashtags', '#urls', '#mentions', '#word', '#capital',
       '#pos_emojis', '#neg_emojis', '#emojis', '#exclaimation_question',
       'keyword', 'sentiment', 'BN', 'PH', 'PAS', 'General']

In [119]:
test_dataset = TwitterDataset(df=test_df, scaler=scaler, training=False, normalize=False)
test_loader = data.DataLoader(test_dataset, batch_size=64, num_workers=4)

array([ 1.43284910e-02, -6.68050930e-03,  3.92305043e-03,  2.90000593e-03,
        1.50832787e-02, -8.47821523e-02,  1.47555937e-02, -9.06560340e-03,
       -3.63997864e-03, -1.88522477e-02,  8.24395494e-03, -2.51325851e-03,
        2.87103008e-02,  1.82489220e-02, -1.65291102e-02,  8.18374134e-03,
       -6.53118748e-03,  4.59973937e-03,  9.18282275e-03, -9.32116907e-03,
        2.32980924e-03, -2.59659115e-03,  8.41681514e-03, -5.76652131e-03,
        1.56107316e-02,  1.50073552e-02, -2.38437614e-02, -7.81121353e-03,
        2.32750522e-02,  2.00437118e-03, -5.41383905e-03, -7.58255519e-03,
        7.84398056e-03,  2.22738192e-02, -6.73363723e-03, -3.06394815e-03,
       -1.56717141e-02,  6.70031434e-03,  1.26784304e-02, -2.57465256e-03,
       -9.64734219e-03, -4.74523512e-03, -8.41431072e-03,  1.14093381e-02,
       -6.66123070e-04,  1.19591735e-02,  1.06282565e-02, -9.07487262e-03,
        1.07270383e-02, -1.42936123e-02,  9.69155639e-03, -5.23357181e-04,
        2.21079667e-02, -

In [130]:
predicted_sent_list = []

with torch.no_grad():
    for i, (features) in enumerate(test_loader):  
        features = features.float()

        output_sent, output_party = net2(features)

        _, predicted_sent = torch.max(output_sent.data, 1)
        
        predicted_sent_list+=predicted_sent.numpy().tolist()

In [124]:
df_pred_sent = pd.DataFrame(np.vstack(predicted_sent_list).astype('int'))
df_pred_sent.columns = ['sentiment2']
df_pred_sent.head()

Unnamed: 0,sentiment2
0,2
1,2
2,2
3,2
4,2


In [126]:
df_test_predicted = pd.concat([test_df, df_pred_sent], sort=False, axis=1)
df_test_predicted['sentiment2'] = df_test_predicted['sentiment2'].map(lambda x: sentiment_mapping_reverse[x])

In [131]:
# df_test_predicted

In [129]:
df_test_predicted.to_excel("./data/issues_processed_v2_prediction_v2.xlsx") 