# Sentiment Analysis
## 1. Data Pre-Processing

In [2]:
import re

import gensim.models
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.corpus import words
nltk.download('words')
import emoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import OneHotEncoder

# from sklearn.externals import joblib
from gensim.models import Word2Vec

from translate import Translator

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
stoplist = set(stopwords.words('english'))
wnl = WordNetLemmatizer()

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ZhuoY\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


ModuleNotFoundError: No module named 'translate'

In [None]:
testmsgs=[
    'Where are these salafi innovators shouting shirk? https://t.co/3gGv14ILc0,2',
    'Paris warns radicals are trying to exploit Yellow Vests &amp; overthrow the... https://t.co/3Y4Whnsctw,2',
    '"Hello \n.\n.\n.\n.\n.\n.\n.\n #selfie #blackgirlmagic #loveyourself #london #shoreditch #eastlondon #melanin #melaninpoppin #muslimah #influencer #photography #artist #art #beautiful #designer… https://t.co/swIqxdWvyF",2',
    '"Plakataktion zur Thematik „Muslime gegen Rasissmus“ von 22.10.2021 - 01.11.2021 ',
    '@KbfMajor @Nigel_Farage 👇watch my gallery👇 That’s the problem with being thick as two short planks. Whether it’s an Islamic Fundamentalist :) in the streets or Nigel Führage on Twitter or GBeebies; it’s the same thing. Both are equally as abhorrent.,2'

]

In [None]:

def remove_url(raw_text):
    restr=r"(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b"
    text_noURL=re.sub(restr," ", raw_text,flags=re.MULTILINE)
    return text_noURL

def deEmojize(text):
    return emoji.demojize(text)

def translate(text,from_lang='en'):
    def get_lang(tag):
        if tag.startswith('de'):
            return 'german'
        else:
            return 'english'
    translator= Translator(from_lang=get_lang(from_lang),to_lang='english')
    return translator.translate(text)

 # Visual inspection of the above function working on testmsg
# for item in testmsgs:
#     print(translate(item))


In [18]:
def extract_words(text):
    '''
    Remove punctuation ; lowercase words and remove words which consists of less than 2 alphabets
    :param text: a sentence
    :return: word list of input sentence
    '''
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    raw_words=tknzr.tokenize(text)
    tokenized_words=[word.lower() for word in raw_words if len(word)>2 and re.search(r'[a-zA-Z]+',word) ]
    return tokenized_words

 # Visual inspection of the above function working on testmsg
# for item in testmsgs:
#     result= extract_words(item)
#     display(' '.join(result))

In [17]:

def get_pos_word(words):
    '''

    :param words:
    :return:
    '''
    def get_wordnet_pos(tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return None

    # label POS for words
    words = pos_tag(words)

    # lemmatize words
    pos_word = [wnl.lemmatize(tag[0], pos=get_wordnet_pos(tag[1]) or wordnet.NOUN) for tag in words]

    # remove stopwords
    cleanwords = [word for word in pos_word if word not in stoplist]

    return cleanwords


In [None]:
def remove_nonsense(cleanwords:list):
    sentslen=0
    for word in cleanwords:
        if re.search(r'\B(\#[a-zA-Z]+\b)(?!;)',word):
            continue
        sentslen+=1
    if sentslen<4:
        cleanwords=list('#######' for i in range(len(cleanwords)))
    return cleanwords
            

def tokenize_words(sentence:str):

    # remove url
    noURLsent=remove_url(sentence)

    # translate emoji to word
    deEmojisent=deEmojize(noURLsent)

    # lowercase words and remove words which consists of less than 2 alphabets
    words=extract_words(deEmojisent)

    # label POS for words for stemming them and removing stopwords
    cleanwords=get_pos_word(words)
    
    vaildwords=remove_nonsense(cleanwords)


    return ' '.join(vaildwords)

def translate_tweets(raw_data:pd.DataFrame):
    translated_tweets=[]
    langs=raw_data['lang'].to_numpy()
    for index,lang in enumerate(langs):

        tweet=raw_data.loc[index]['content']
        
        if lang !='en':
            tweet=remove_url(tweet)
            words=extract_words(tweet)
            tweet=' '.join(words)
            translated_tweets.append(translate(tweet,lang))
#             translated_tweets.append(' ')

        else:
            translated_tweets.append(tweet)

    return translated_tweets


In [None]:
TA_TRAIN=500
TA_TEST=0
def preprocessing(train_path:str,trial_path:str,storepath=None,translate_tag=False):

    # Loading raw data from the file
    raw_data_train=pd.read_csv(train_path)
    raw_data_trial=pd.read_csv(trial_path)


    global TA_TRAIN
    TA_TRAIN=raw_data_train.shape[0]
    global TA_TEST
    TA_TEST=raw_data_trial.shape[0]

    
    raw_data=pd.concat([raw_data_train,raw_data_trial])
    raw_data=raw_data.reset_index(drop=True)
    # Extract engish words from sentence

    if(translate_tag==True):
        translated_tweets=translate_tweets(raw_data)
        raw_data['translated_tweets']=translated_tweets        

    raw_data['tokenized_content']=raw_data['translated_tweets'].apply(tokenize_words)
    
    if(storepath!=None):
        raw_data.to_csv(storepath)


#     raw_data['tokenized_content']=raw_data['content'].apply(tokenize_words)


    # Remove empyte row data
#     raw_data = raw_data[~(raw_data['tokenized_content'].str.len() == 0)]
    
    return raw_data


## 2. Training Model

In [None]:
def vectorize_tweet(data:list,model_w2v:Word2Vec,vector_size=200):

    def word_vector(token:list,model_w2v:Word2Vec,verctor_size:int):
        vec = np.zeros(verctor_size).reshape((1, vector_size))
        count = 0
        for word in token:
            try:
                vec += model_w2v.wv.get_vector(word).reshape((1, vector_size))
                count += 1.
            except KeyError:  # handling the case where the token is not in vocabulary
                continue
        if count != 0:
            vec /= count
        return vec

    wordvec_arrays = np.zeros((len(data), vector_size))
    for i in range(len(data)):
        wordvec_arrays[i,:] = word_vector(data[i], verctor_size=vector_size, model_w2v=model_w2v)
    wordvec_df = pd.DataFrame(wordvec_arrays)
    return wordvec_df


In [None]:
def word2ved_model_train(data:list,vector_size=200):
    '''
    :param data: [[tokens_sent_1],[tokens_sent_2]...]
    :return:
    '''

    model_w2v=gensim.models.Word2Vec(
        data,
        vector_size=vector_size,
        window=5,
        min_count=2
    )

    model_w2v.train(data,total_examples=len(data),epochs=20)
    model_path='./src/model/model_'+str(len(model_w2v.wv.index_to_key))+'_words_'+str(len(data))+'tws.model'
    model_w2v.save(model_path)

    return model_path

In [62]:
TRAINSIZE=0.8
DEVSIZE=0.2
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
def splitDataset(data:pd.DataFrame):
    '''

    :param data:
    :return: 5 lists
    '''
    data=raw_data

    train_df=data[:TA_TRAIN]
    trial_df=data[TA_TRAIN:]

    x_train=train_df['tokenized_content'].loc[:int(len(train_df)*TRAINSIZE)]
    x_vaild=train_df['tokenized_content'].loc[int(len(train_df)*TRAINSIZE):]

    x_test=trial_df['tokenized_content']

    y_train=train_df['label'].loc[:int(len(train_df)*TRAINSIZE)].to_numpy().tolist()
    y_vaild=train_df['label'].loc[int(len(train_df)*DEVSIZE):].to_numpy().tolist()


    return x_train, x_vaild, x_test, y_train, y_vaild

def finetune(xtrain_w2v_np,ytrain_np,xvalid_w2v_np,yvaild_np):
    '''
    Using GridSearchCV to find best hyperparamters for svm
    :param xtrain_w2v_np:
    :param ytrain_np:
    :param xvalid_w2v_np:
    :param yvaild_np:
    :return:
    '''
    param_grid = {'C': [0.0125,0.01,0.05,0.08,0.1],
              'gamma': [0.1,0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear','poly']}
    grid = GridSearchCV(SVC(degree=3,decision_function_shape='ovr'), param_grid, refit = True, verbose = 3)

    # fitting the model for grid search
    grid.fit(xtrain_w2v_np, ytrain_np)

    # print best parameter after tuning
    print('Best paragrams: ',grid.best_params_)

    # print how our model looks after hyper-parameter tuning
    print(grid.best_estimator_)

    grid_predictions = grid.predict(xvalid_w2v_np)

    # print classification report
    print(classification_report(yvaild_np, grid_predictions))

def word2vec_svm(data:pd.DataFrame,vector_size=200,finetuned=False):
    '''
    Train a word2vec model, then train svm classifier based on the word embedding model
    :param data:
    :param vector_size:
    :return:
    '''

    # split dataset into trainset, validset and testset
    x_train, x_vaild, x_test, y_train, y_vaild=splitDataset(data)

    # concentrate tweets from all sets to train word2vec model
    combined=pd.concat([x_train,x_vaild,x_test])
    combined=list(combined.to_numpy())

    # train a word2vec model and return its stored path
    model_path=word2ved_model_train(combined,vector_size=vector_size)

    model_w2v=Word2Vec.load(model_path)

    # vectorize sentences
    wordvec_df=vectorize_tweet(combined,model_w2v,vector_size=vector_size)

    # split sentences vector matrx into trainset, validset and testset
    train_w2v = wordvec_df.iloc[:x_train.shape[0]+x_vaild.shape[0],:]
    test_w2v = wordvec_df.iloc[x_train.shape[0]+x_vaild.shape[0]:,:]
    xtrain_w2v = train_w2v.iloc[y_train,:]
    xvalid_w2v = train_w2v.iloc[y_vaild,:]

    # convert dataformat from dataframe to numpy
    xtrain_w2v_np=xtrain_w2v.to_numpy()
    xvalid_w2v_np=xvalid_w2v.to_numpy()
    ytrain_np=np.array(y_train)
    yvaild_np=np.array(y_vaild)
    test_w2v_np=test_w2v.to_numpy()

    # Using GridSearchCV to finetune model
    if finetuned:
        finetune(xtrain_w2v_np,ytrain_np,xvalid_w2v_np,yvaild_np)

    # Using one versus rest decision_function provided by sklearn
    svc=OneVsRestClassifier(SVC(kernel='linear',degree=3, C=0.07,gamma=0.1,decision_function_shape='ovr',random_state=20)).fit(xtrain_w2v_np, ytrain_np)

    # Predict valid dataset by trained classifier svc
    y_pred_valid=svc.predict(xvalid_w2v_np)
    y_pred_valid=y_pred_valid.astype(int)

    print(classification_report(yvaild_np, y_pred_valid))

    # Predict test dataset by trained classifier svc
    pred_test=svc.predict(test_w2v_np)
    pred_test=pred_test.astype(int)

    countclasses(pred_test)

    return pred_test


sd=word2vec_svm(raw_data,vector_size=300)

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        34
         1.0       1.00      1.00      1.00        20
         2.0       0.91      1.00      0.95       346

    accuracy                           0.92       400
   macro avg       0.64      0.67      0.65       400
weighted avg       0.84      0.92      0.87       400

zero 0
one 37
two 963


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

def savepredictions(filename:str,prediction:list):
    # write to file:
    file=open(filename,'w',encoding='utf-8')
    try:
        for label in prediction:
            file.write(str(label)+'\n')
    finally:
        file.close()

def printWrongResults(expected:list, actuall:list,expected_df:np.array):
    train_path='./src/dataset/train.csv'
    trial_path='./src/dataset/trial.csv'
    train_dataset=pd.read_csv(train_path,index_col=None)
    trial_dataset=pd.read_csv(trial_path,index_col=None)
    w_tweets=[]
    index_tweets=[]
    for index in range(len(expected)):
        if expected[index]!=actuall[index]:
            print(train_dataset['content'].loc[index]+'Was '+str(actuall[index])+' Should be '+str(expected[index]))
            print('----------------------------\n')

def countclasses(data):
    nrofzeros=0;
    nrofones=0;
    nroftwos=0;
    for j in range(len(data)):
        if data[j]==0:
          nrofzeros+=1;
        elif data[j]==1:
          nrofones+=1;
        else:
          nroftwos+=1;

    print("zero",nrofzeros)
    print("one",nrofones)
    print("two",nroftwos)


In [26]:
if __name__ == '__main__':

    train_path='./src/dataset/train.csv'
    trial_path='./src/dataset/trial.csv'
    test_path='./src/dataset/test.csv'

    
    # preprocessing row data from train.csv. However, because of limited access of translator API,
    # Highly recommend invoking preprocessing function on train_translated.csv where store already translated tweets
    # raw_data=preprocessing(train_path,test_path,storepath='./src/dataset/train_translated.csv',translate_tag=False)

    # Or you can directly read preprocessed data from train_translated.csv in 'tokenized_content' column
    raw_data=pd.read_csv('./src/dataset/train_translated.csv')
    raw_data['tokenized_content']=raw_data['translated_tweets'].apply(tokenize_words)
    

    predic_w2v_svm=word2vec_svm(raw_data,vector_size=300)

    countclasses(predic_w2v_svm)

    # save prediction in file
#     savepredictions(
#         './src/predictions/y_pred_svm_w2c(C=0.088,gamma=1)_OVR_translated_test.txt', predic_w2v_svm)



              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        34
         1.0       1.00      1.00      1.00        20
         2.0       0.91      1.00      0.95       346

    accuracy                           0.92       400
   macro avg       0.64      0.67      0.65       400
weighted avg       0.84      0.92      0.87       400

zero 0
one 37
two 963


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
