# Initialisation

In [175]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

OUTPUT_FOLDER = 'model/'

seed = 88
    


In [176]:
column_names = ["target", "id", "date", "flag", "user", "text"]

# Read the CSV file with specified column names
df = pd.read_csv("../dataset/training.1600000.processed.noemoticon.csv", 
                 encoding="ISO-8859-1", names=column_names)

def reduce_sample(df, frac, random_state):
    df = df.sample(frac=frac, random_state=random_state)
    df = df.reset_index(drop=True)
    return df
frac_pop = 1
df = reduce_sample(df, frac_pop, seed)

Columns in dataset

In [177]:
print("Columns in the original dataset:\n")
print(df.columns)

Columns in the original dataset:

Index(['target', 'id', 'date', 'flag', 'user', 'text'], dtype='object')


Example of an Row in dataset

In [178]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1974058893,Sat May 30 12:21:30 PDT 2009,NO_QUERY,BrookeAmanda,Ok it's only been a couple hours since dad has...
1,4,1998068077,Mon Jun 01 17:56:23 PDT 2009,NO_QUERY,KarinaKornacka,@graceofrhythm HAHA no i would never do that!!...
2,4,1999729993,Mon Jun 01 20:43:12 PDT 2009,NO_QUERY,stevegaghagen,Law of Attraction Creations: Law of Attraction...
3,4,2006627206,Tue Jun 02 11:26:46 PDT 2009,NO_QUERY,Hecie,is ordering ticketsssss EEEE (: &lt;3
4,4,1991292674,Mon Jun 01 06:46:16 PDT 2009,NO_QUERY,shanaloren,@STO_MAC nah im not mad at u....luv u too


In [179]:
df.describe()

Unnamed: 0,target,id
count,1600000.0,1600000.0
mean,2.0,1998818000.0
std,2.000001,193576100.0
min,0.0,1467810000.0
25%,0.0,1956916000.0
50%,2.0,2002102000.0
75%,4.0,2177059000.0
max,4.0,2329206000.0


Clean data

In [180]:
def clean_dataset(df, drop_columns):
    df_cleaned = df.dropna()
    df_cleaned = df_cleaned.drop_duplicates()
    df_cleaned = df_cleaned.drop(columns=drop_columns)
    df_cleaned = df_cleaned.reset_index(drop=True)
    df_cleaned.describe()
    return df_cleaned

df_cleaned = clean_dataset(df, ["date", "id", "flag", "user"])

In [181]:
df_cleaned.head()

Unnamed: 0,target,text
0,0,Ok it's only been a couple hours since dad has...
1,4,@graceofrhythm HAHA no i would never do that!!...
2,4,Law of Attraction Creations: Law of Attraction...
3,4,is ordering ticketsssss EEEE (: &lt;3
4,4,@STO_MAC nah im not mad at u....luv u too


Remove twitter tag and hashtag

In [182]:
import re
def remove_hashtag(df_cleaned):
    df_cleaned['text'] = df_cleaned['text'].apply(lambda x: re.sub(r"http\S+|@\S+|#\S+", "", x))
    return df_cleaned

df_cleaned = remove_hashtag(df_cleaned)

In [183]:
df_cleaned.head()

Unnamed: 0,target,text
0,0,Ok it's only been a couple hours since dad has...
1,4,HAHA no i would never do that!!! I actually m...
2,4,Law of Attraction Creations: Law of Attraction...
3,4,is ordering ticketsssss EEEE (: &lt;3
4,4,nah im not mad at u....luv u too


convert target back to -1 0 1


In [184]:
def convert_to_unitary_target(df_cleaned, target_column):
    df_cleaned[target_column] = df_cleaned[target_column].map({0: -1, 2: 0, 4: 1})
    return df_cleaned

df_cleaned = convert_to_unitary_target(df_cleaned, 'target')
df_cleaned['target'].value_counts()

target
-1    800000
 1    800000
Name: count, dtype: int64

Tokenisation

In [185]:
def tokenize_text(df_cleaned, text_column, tokenized_text_column):
    from gensim.utils import simple_preprocess
    # Tokenize the text column to get the new column 'tokenized_text'
    df_cleaned[tokenized_text_column] = [simple_preprocess(line, deacc=True) for line in df_cleaned[text_column]]
    return df_cleaned
    
df_cleaned = tokenize_text(df_cleaned, 'text', 'tokenized_text')
df_cleaned['tokenized_text'].head()

0    [ok, it, only, been, couple, hours, since, dad...
1    [haha, no, would, never, do, that, actually, m...
2    [law, of, attraction, creations, law, of, attr...
3                [is, ordering, ticketsssss, eeee, lt]
4                    [nah, im, not, mad, at, luv, too]
Name: tokenized_text, dtype: object

# Stemming & Lemma

In [186]:
df_to_be_stemmed = df_cleaned.copy()

### PoterStammer

In [187]:
def porter_stemmer_on_text(df_to_be_stemmed, token_text_column, stemmed_text_column):
    from gensim.parsing.porter import PorterStemmer
    porter_stemmer = PorterStemmer()
    df_potter_stemmed = df_to_be_stemmed.copy()
    # Get the stemmed_tokens
    df_potter_stemmed[stemmed_text_column] = [[porter_stemmer.stem(word) for word in tokens] 
                                        for tokens in df_potter_stemmed[token_text_column]]  
    return df_potter_stemmed



### Lancaster

In [188]:
def lancaster_stemmer_on_text(df_to_be_stemmed, token_text_column, stemmed_text_column):
    from nltk.stem.lancaster import LancasterStemmer
    lancaster_stemmer = LancasterStemmer()
    df_lancaster_stemmed = df_to_be_stemmed.copy()
    # Get the stemmed_tokens
    df_lancaster_stemmed[stemmed_text_column] = [[lancaster_stemmer.stem(word) for word in tokens] 
                                        for tokens in df_lancaster_stemmed[token_text_column]]
    
    return df_lancaster_stemmed


### Snowball

In [189]:
def snowball_stemmer_on_text(df_to_be_stemmed, token_text_column, stemmed_text_column):
    from nltk.stem.snowball import EnglishStemmer
    snowball_stemmer = EnglishStemmer()
    df_snowball_stemmed = df_to_be_stemmed.copy()
    # Get the stemmed_tokens
    df_snowball_stemmed[stemmed_text_column] = [[snowball_stemmer.stem(word) for word in tokens] 
                                        for tokens in df_snowball_stemmed[token_text_column]]
    
    return df_snowball_stemmed


### Lemmatisation

In [190]:
def lemmatize_text(df_to_be_stemmed, token_text_column, lemmatized_text_column):
    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    df_lemmatized = df_to_be_stemmed.copy()
    
    # Get the lemmatized_tokens
    df_lemmatized[lemmatized_text_column] = [[wordnet_lemmatizer.lemmatize(word) for word in tokens] 
                                          for tokens in df_lemmatized[token_text_column]]


In [191]:
df_potter_stemmed = porter_stemmer_on_text(df_to_be_stemmed, 'tokenized_text', 'stemmed_text')
df_potter_stemmed['stemmed_text'].head(10)    

0    [ok, it, onli, been, coupl, hour, sinc, dad, h...
1    [haha, no, would, never, do, that, actual, mad...
2    [law, of, attract, creation, law, of, attract,...
3                   [is, order, ticketsssss, eeee, lt]
4                    [nah, im, not, mad, at, luv, too]
5    [centuri, room, tast, the, rainbow, with, your...
6    [ari, also, got, servic, award, for, the, comm...
7    [man, thought, somethin, wa, fina, go, done, s...
8           [leav, moscow, when, it, final, get, warm]
9                           [wish, got, summer, break]
Name: stemmed_text, dtype: object

## Split into Train and Test Sets

- Train data ( Subset of data for training ML Model) ~70%
- Test data (Subset of data for testing ML Model trained from the train data)

In [192]:
from sklearn.model_selection import train_test_split

def split_train_test(data, sentiment_value_col, tokenised_text_col, test_size=0.3, shuffle_state=True):
    X_train, X_test, Y_train, Y_test = train_test_split( data[tokenised_text_col],
                                                        data[sentiment_value_col], 
                                                        shuffle=shuffle_state,
                                                        test_size=test_size, 
                                                        random_state=15)
    print("Value counts for Train sentiments")
    print(Y_train.value_counts())
    print("Value counts for Test sentiments")
    print(Y_test.value_counts())
    print(type(X_train))
    print(type(Y_train))
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    Y_train = Y_train.to_frame()
    Y_train = Y_train.reset_index()
    Y_test = Y_test.to_frame()
    Y_test = Y_test.reset_index()
    print(X_train.head())
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = split_train_test(df_potter_stemmed, 'target', 'stemmed_text')

Value counts for Train sentiments
target
 1    560206
-1    559794
Name: count, dtype: int64
Value counts for Test sentiments
target
-1    240206
 1    239794
Name: count, dtype: int64
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
     index                                       stemmed_text
0  1448643  [hiya, davina, think, you, re, great, soooo, l...
1  1423081  [tadi, se, neda, pracovat, sous, nade, mnou, d...
2  1598349  [photo, eatsleepdraw, iti, ½ll, be, sick, for,...
3   405940  [to, top, that, off, my, tricep, ar, kill, me,...
4  1050615  [saw, box, full, of, star, war, miniatur, toda...


# Word2Vec 

## Save-gram approach

### Generate model

vector_size (int, optional) – Dimensionality of the word vectors.

window (int, optional) – Maximum distance between the current and predicted word within a sentence.

min_count (int, optional) – Ignores all words with total frequency lower than this.

workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).

sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.

In [193]:
def generate_word2vec_model(stemmed_df, file_name_code, stem_column_name, sg, vector_size, window, min_count, workers):
    from gensim.models import Word2Vec
    # Skip-gram model (sg = 1)
    filename = f"{file_name_code}_wind_{window}_min_{min_count}_workers_{workers}.wordvectors"
    stemmed_tokens = pd.Series(stemmed_df[stem_column_name]).values
    # Train the Word2Vec Model
    w2v_model = Word2Vec(stemmed_tokens, min_count = min_count, vector_size = vector_size, workers = workers, window = window, sg = sg, cbow_mean = 1)
    w2v_model_wv = w2v_model.wv
    w2v_model_wv.save(OUTPUT_FOLDER + filename)

    return w2v_model_wv, OUTPUT_FOLDER + filename

vector_size = 100
sg = 0
file_name_code = f"vec_sz_{vector_size}_sg_{sg}_frac_pop_{frac_pop}"
OUTPUT_FOLDER = 'model/' + file_name_code + '/'
import os
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

w2v_model_wv, output_name = generate_word2vec_model(df_potter_stemmed, file_name_code,'stemmed_text', sg = sg, vector_size=vector_size, min_count=1, window=8, workers=100)

# print(w2v_model_cbow_wv.most_similar('good'))
# print(w2v_model_sg_wv.most_similar('good'))


### Load model

In [194]:
import numpy as np
from gensim.models import KeyedVectors
# Load the model from the model file

def load_word2vec_model(reduced_mode_file, stem_column_name):
    sg_w2v_model_wv = KeyedVectors.load(reduced_mode_file)
    # # Unique ID of the word
    # print("Index of the word 'action':")
    # print(sg_w2v_model_wv.key_to_index["action"])
    # # Total number of the words
    # print(len(sg_w2v_model_wv.key_to_index))
    # # Print the size of the word2vec vector for one word
    # print("Length of the vector generated for a word")
    # print(len(sg_w2v_model_wv['action']))
    # # Get the mean for the vectors for an example review
    # print("Print the length after taking average of all word vectors in a sentence:")
    # print(np.mean([sg_w2v_model_wv[token] for token in df_potter_stemmed[stem_column_name][0]], axis=0))
    return sg_w2v_model_wv
    
w2v_model_wv = load_word2vec_model(output_name, 'stemmed_text')

In [195]:
def save_word2vec_to_csv(X_set, sg_w2v_model_wv, stem_col, word2vec_filename):
    with open(word2vec_filename, 'w+') as word2vec_file:
        for index, row in X_set.iterrows():
            v_norm = (np.mean([sg_w2v_model_wv[token] for token in row[stem_col]], axis=0)).tolist()
            if index == 0:
                header = ",".join(str(ele) for ele in range(vector_size))
                word2vec_file.write(header)
                word2vec_file.write("\n")
            # Check if the line exists else it is vector of zeros
            if type(v_norm) is list:  
                line1 = ",".join( [str(vector_element) for vector_element in v_norm] )
            else:
                line1 = ",".join([str(0) for i in range(vector_size)])
            word2vec_file.write(line1)
            word2vec_file.write('\n')
           

    
train_X_word2vec_filename = OUTPUT_FOLDER +  f"train_X_{file_name_code}.csv"
test_X_word2vec_filename = OUTPUT_FOLDER + f"test_X_{file_name_code}.csv"


save_word2vec_to_csv(X_train, w2v_model_wv, "stemmed_text", train_X_word2vec_filename)
save_word2vec_to_csv(X_test, w2v_model_wv, "stemmed_text", test_X_word2vec_filename)

# save_word2vec_to_csv(X_train, w2v_model_cbow_wv, "stemmed_text", train_X_word2vec_cbow_filename)
# save_word2vec_to_csv(X_test, w2v_model_cbow_wv, "stemmed_text", test_X_word2vec_cbow_filename)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


## Load Training and Testing Set

In [196]:
def load_train_w2v_from_csv(word2vec_filename):
    train_word2vec_df = pd.read_csv(word2vec_filename)
    return train_word2vec_df

def load_test_wv_w2v_from_csv(test_X_word2vec_filename):
    return pd.read_csv(test_X_word2vec_filename)

def generate_X_w2v_df(X_set, w2v_model_wv, stem_column_name):
    X_wv = []
    for index, row in X_set.iterrows():
        model_vector = (np.mean([w2v_model_wv[token] for token in row[stem_column_name]], axis=0))
        if model_vector.shape == () :
            model_vector = np.zeros(vector_size)
        X_wv.append(model_vector.reshape(1, -1))
    return pd.DataFrame(np.concatenate(X_wv, axis=0))

X_train_wv = load_train_w2v_from_csv(train_X_word2vec_filename)
X_test_wv = load_test_wv_w2v_from_csv(test_X_word2vec_filename)

# X_train_wv = generate_X_w2v_df(X_train, w2v_model_wv, "stemmed_text")
# X_test_wv = generate_X_w2v_df(X_test, w2v_model_wv, "stemmed_text")

# Training Model

## Decision Tree Classifier

In [197]:
def train_decision_tree_word2vec(X_train_wv, Y_train, file_name_code):
    from sklearn.tree import DecisionTreeClassifier
    #Initialize the model
    clf_decision_word2vec = DecisionTreeClassifier()
    # Fit the model
    clf_decision_word2vec.fit(X_train_wv, Y_train['target'])

    import joblib
    joblib.dump(clf_decision_word2vec, OUTPUT_FOLDER + f'clf_dt_cbow_{file_name_code}.pkl')

    return clf_decision_word2vec

clf_decision_word2vec = train_decision_tree_word2vec(X_train_wv, Y_train, file_name_code)

Testing the model

In [198]:
def test_decision_tree_word2vec(Y_test, X_test_wv, clf):
    from sklearn.metrics import classification_report
    test_predictions_word2vec = clf.predict(X_test_wv)

    print(classification_report(Y_test['target'], test_predictions_word2vec))

test_decision_tree_word2vec(Y_test, X_test_wv, clf_decision_word2vec)

              precision    recall  f1-score   support

          -1       0.65      0.65      0.65    240206
           1       0.65      0.65      0.65    239794

    accuracy                           0.65    480000
   macro avg       0.65      0.65      0.65    480000
weighted avg       0.65      0.65      0.65    480000



## SVM

### Standard

In [199]:
def train_linear_svc(X_train_wv, Y_train, file_name_code):
    from sklearn.svm import LinearSVC
    #Initialize the model
    clf_decision_word2vec = LinearSVC()
    # Fit the model
    clf_decision_word2vec.fit(X_train_wv, Y_train['target'])

    import joblib
    joblib.dump(clf_decision_word2vec, OUTPUT_FOLDER + f'l_svc_{file_name_code}.pkl')

    return clf_decision_word2vec

svc_clf = train_linear_svc(X_train_wv, Y_train, file_name_code)



In [200]:
def test_linear_svc(Y_test, X_test_wv, clf):
    from sklearn.metrics import classification_report
    test_predictions_word2vec = clf.predict(X_test_wv)

    print(classification_report(Y_test['target'], test_predictions_word2vec))

test_linear_svc(Y_test, X_test_wv, svc_clf)

              precision    recall  f1-score   support

          -1       0.76      0.76      0.76    240206
           1       0.76      0.76      0.76    239794

    accuracy                           0.76    480000
   macro avg       0.76      0.76      0.76    480000
weighted avg       0.76      0.76      0.76    480000



## RandomForestClassifier

In [201]:
def train_random_forest_clf(X_train_wv, Y_train, file_name_code):
    from sklearn.ensemble import RandomForestClassifier
    #Initialize the model
    clf_decision_word2vec = RandomForestClassifier()
    # Fit the model
    clf_decision_word2vec.fit(X_train_wv, Y_train['target'])

    import joblib
    joblib.dump(clf_decision_word2vec, OUTPUT_FOLDER + f'random_forest_dt_clf_{file_name_code}.pkl')

    return clf_decision_word2vec

clf_rfdt = train_random_forest_clf(X_train_wv, Y_train, file_name_code)

In [202]:
def test_random_forest_clf(Y_test, X_test_wv, clf):
    from sklearn.metrics import classification_report
    # from joblib import load
    # clf = load(OUTPUT_FOLDER + 'svm_classifier_scl_linear.pkl')
    test_predictions_word2vec_svm_scaled = clf.predict(X_test_wv)

    print(classification_report(Y_test['target'], test_predictions_word2vec_svm_scaled))

test_random_forest_clf(Y_test, X_test_wv, clf_rfdt)

              precision    recall  f1-score   support

          -1       0.76      0.77      0.76    240206
           1       0.77      0.75      0.76    239794

    accuracy                           0.76    480000
   macro avg       0.76      0.76      0.76    480000
weighted avg       0.76      0.76      0.76    480000



## MultinomialNB

In [204]:
def train_naive_bay(X_train_wv, Y_train, file_name_code):
    from sklearn.naive_bayes import GaussianNB
    #Initialize the model
    clf_decision_word2vec = GaussianNB()
    # Fit the model
    clf_decision_word2vec.fit(X_train_wv, Y_train['target'])

    import joblib
    joblib.dump(clf_decision_word2vec, OUTPUT_FOLDER + f'gauss_NB_{file_name_code}.pkl')

    return clf_decision_word2vec

svc_clf = train_naive_bay(X_train_wv, Y_train, file_name_code)

In [205]:
def test_naive_bay(Y_test, X_test_wv, clf):
    from sklearn.metrics import classification_report
    test_predictions_word2vec = clf.predict(X_test_wv)

    print(classification_report(Y_test['target'], test_predictions_word2vec))

test_naive_bay(Y_test, X_test_wv, svc_clf)

              precision    recall  f1-score   support

          -1       0.62      0.81      0.70    240206
           1       0.73      0.50      0.60    239794

    accuracy                           0.66    480000
   macro avg       0.68      0.66      0.65    480000
weighted avg       0.68      0.66      0.65    480000



# Result


## Trial 1 26/3/2024


- vector size = 100
- sg = 1
- frac_pop = 1

W2V size = 90KB

|Model name | Setting | F1 | Accuracy | Size|
|-----------|----------|----|----------|------|
|Decision tree | DEFAULT | 0.66 | 0.66| 19,000 KB|
|Linear SVC | DEFAULT | 0.76 | 0.76 | 3KB |
|Random Forest | DEFAULT | 0.76 | 0.76 | 1.9e6 KB|
|Gauss NB| DEFAULT | 0.70\0.60 | 0.66 | 6KB |
