# Initialisation

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

OUTPUT_FOLDER = '../model/'

seed = 88
    


In [31]:
column_names = ["target", "id", "date", "flag", "user", "text"]

# Read the CSV file with specified column names
df = pd.read_csv("../dataset/training.1600000.processed.noemoticon.csv", 
                 encoding="ISO-8859-1", names=column_names)

def reduce_sample(df, frac, random_state):
    df = df.sample(frac=frac, random_state=random_state)
    df = df.reset_index(drop=True)
    return df

# df = reduce_sample(df, 0.3, seed)

Columns in dataset

In [32]:
print("Columns in the original dataset:\n")
print(df.columns)

Columns in the original dataset:

Index(['target', 'id', 'date', 'flag', 'user', 'text'], dtype='object')


Example of an Row in dataset

In [33]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1974058893,Sat May 30 12:21:30 PDT 2009,NO_QUERY,BrookeAmanda,Ok it's only been a couple hours since dad has...
1,4,1998068077,Mon Jun 01 17:56:23 PDT 2009,NO_QUERY,KarinaKornacka,@graceofrhythm HAHA no i would never do that!!...
2,4,1999729993,Mon Jun 01 20:43:12 PDT 2009,NO_QUERY,stevegaghagen,Law of Attraction Creations: Law of Attraction...
3,4,2006627206,Tue Jun 02 11:26:46 PDT 2009,NO_QUERY,Hecie,is ordering ticketsssss EEEE (: &lt;3
4,4,1991292674,Mon Jun 01 06:46:16 PDT 2009,NO_QUERY,shanaloren,@STO_MAC nah im not mad at u....luv u too


In [34]:
df.describe()

Unnamed: 0,target,id
count,480000.0,480000.0
mean,1.997267,1999221000.0
std,2.0,193630000.0
min,0.0,1467810000.0
25%,0.0,1956973000.0
50%,0.0,2002237000.0
75%,4.0,2177253000.0
max,4.0,2329205000.0


Clean data

In [35]:
def clean_dataset(df, drop_columns):
    df_cleaned = df.dropna()
    df_cleaned = df_cleaned.drop_duplicates()
    df_cleaned = df_cleaned.drop(columns=drop_columns)
    df_cleaned = df_cleaned.reset_index(drop=True)
    df_cleaned.describe()
    return df_cleaned

df_cleaned = clean_dataset(df, ["date", "id", "flag", "user"])

In [36]:
df_cleaned.head()

Unnamed: 0,target,text
0,0,Ok it's only been a couple hours since dad has...
1,4,@graceofrhythm HAHA no i would never do that!!...
2,4,Law of Attraction Creations: Law of Attraction...
3,4,is ordering ticketsssss EEEE (: &lt;3
4,4,@STO_MAC nah im not mad at u....luv u too


Remove twitter tag and hashtag

In [37]:
import re
def remove_hashtag(df_cleaned):
    df_cleaned['text'] = df_cleaned['text'].apply(lambda x: re.sub(r"http\S+|@\S+|#\S+", "", x))
    return df_cleaned

df_cleaned = remove_hashtag(df_cleaned)

In [38]:
df_cleaned.head()

Unnamed: 0,target,text
0,0,Ok it's only been a couple hours since dad has...
1,4,HAHA no i would never do that!!! I actually m...
2,4,Law of Attraction Creations: Law of Attraction...
3,4,is ordering ticketsssss EEEE (: &lt;3
4,4,nah im not mad at u....luv u too


convert target back to -1 0 1


In [39]:
def convert_to_unitary_target(df_cleaned, target_column):
    df_cleaned[target_column] = df_cleaned[target_column].map({0: -1, 2: 0, 4: 1})
    return df_cleaned

df_cleaned = convert_to_unitary_target(df_cleaned, 'target')
df_cleaned['target'].value_counts()

target
-1    240328
 1    239672
Name: count, dtype: int64

Tokenisation

In [40]:
def tokenize_text(df_cleaned, text_column, tokenized_text_column):
    from gensim.utils import simple_preprocess
    # Tokenize the text column to get the new column 'tokenized_text'
    df_cleaned[tokenized_text_column] = [simple_preprocess(line, deacc=True) for line in df_cleaned[text_column]]
    return df_cleaned
    
df_cleaned = tokenize_text(df_cleaned, 'text', 'tokenized_text')
df_cleaned['tokenized_text'].head()

0    [ok, it, only, been, couple, hours, since, dad...
1    [haha, no, would, never, do, that, actually, m...
2    [law, of, attraction, creations, law, of, attr...
3                [is, ordering, ticketsssss, eeee, lt]
4                    [nah, im, not, mad, at, luv, too]
Name: tokenized_text, dtype: object

# Stemming & Lemma

In [41]:
df_to_be_stemmed = df_cleaned.copy()

### PoterStammer

In [42]:
def porter_stemmer_on_text(df_to_be_stemmed, token_text_column, stemmed_text_column):
    from gensim.parsing.porter import PorterStemmer
    porter_stemmer = PorterStemmer()
    df_potter_stemmed = df_to_be_stemmed.copy()
    # Get the stemmed_tokens
    df_potter_stemmed[stemmed_text_column] = [[porter_stemmer.stem(word) for word in tokens] 
                                        for tokens in df_potter_stemmed[token_text_column]]  
    return df_potter_stemmed



### Lancaster

In [43]:
def lancaster_stemmer_on_text(df_to_be_stemmed, token_text_column, stemmed_text_column):
    from nltk.stem.lancaster import LancasterStemmer
    lancaster_stemmer = LancasterStemmer()
    df_lancaster_stemmed = df_to_be_stemmed.copy()
    # Get the stemmed_tokens
    df_lancaster_stemmed[stemmed_text_column] = [[lancaster_stemmer.stem(word) for word in tokens] 
                                        for tokens in df_lancaster_stemmed[token_text_column]]
    
    return df_lancaster_stemmed


### Snowball

In [44]:
def snowball_stemmer_on_text(df_to_be_stemmed, token_text_column, stemmed_text_column):
    from nltk.stem.snowball import EnglishStemmer
    snowball_stemmer = EnglishStemmer()
    df_snowball_stemmed = df_to_be_stemmed.copy()
    # Get the stemmed_tokens
    df_snowball_stemmed[stemmed_text_column] = [[snowball_stemmer.stem(word) for word in tokens] 
                                        for tokens in df_snowball_stemmed[token_text_column]]
    
    return df_snowball_stemmed


### Lemmatisation

In [45]:
def lemmatize_text(df_to_be_stemmed, token_text_column, lemmatized_text_column):
    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    df_lemmatized = df_to_be_stemmed.copy()
    
    # Get the lemmatized_tokens
    df_lemmatized[lemmatized_text_column] = [[wordnet_lemmatizer.lemmatize(word) for word in tokens] 
                                          for tokens in df_lemmatized[token_text_column]]


In [46]:
df_potter_stemmed = porter_stemmer_on_text(df_to_be_stemmed, 'tokenized_text', 'stemmed_text')
df_potter_stemmed['stemmed_text'].head(10)    

0    [ok, it, onli, been, coupl, hour, sinc, dad, h...
1    [haha, no, would, never, do, that, actual, mad...
2    [law, of, attract, creation, law, of, attract,...
3                   [is, order, ticketsssss, eeee, lt]
4                    [nah, im, not, mad, at, luv, too]
5    [centuri, room, tast, the, rainbow, with, your...
6    [ari, also, got, servic, award, for, the, comm...
7    [man, thought, somethin, wa, fina, go, done, s...
8           [leav, moscow, when, it, final, get, warm]
9                           [wish, got, summer, break]
Name: stemmed_text, dtype: object

## Split into Train and Test Sets

- Train data ( Subset of data for training ML Model) ~70%
- Test data (Subset of data for testing ML Model trained from the train data)

In [47]:
from sklearn.model_selection import train_test_split

def split_train_test(data, sentiment_value_col, tokenised_text_col, test_size=0.3, shuffle_state=True):
    X_train, X_test, Y_train, Y_test = train_test_split( data[tokenised_text_col],
                                                        data[sentiment_value_col], 
                                                        shuffle=shuffle_state,
                                                        test_size=test_size, 
                                                        random_state=15)
    print("Value counts for Train sentiments")
    print(Y_train.value_counts())
    print("Value counts for Test sentiments")
    print(Y_test.value_counts())
    print(type(X_train))
    print(type(Y_train))
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    Y_train = Y_train.to_frame()
    Y_train = Y_train.reset_index()
    Y_test = Y_test.to_frame()
    Y_test = Y_test.reset_index()
    print(X_train.head())
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = split_train_test(df_potter_stemmed, 'target', 'stemmed_text')

Value counts for Train sentiments
target
-1    168133
 1    167867
Name: count, dtype: int64
Value counts for Test sentiments
target
-1    72195
 1    71805
Name: count, dtype: int64
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
    index                                       stemmed_text
0  266527  [lil, is, move, to, alabama, at, the, end, of,...
1  401839  [hangov, and, unfortun, there, lot, of, work, ...
2  218733                 [damn, won, thi, round, homi, lol]
3  435715       [dont, you, worri, hear, the, tequila, call]
4  203570  [wish, could, join, you, on, fridai, but, on, ...


# Word2Vec 

## Save-gram approach

### Generate model

vector_size (int, optional) – Dimensionality of the word vectors.

window (int, optional) – Maximum distance between the current and predicted word within a sentence.

min_count (int, optional) – Ignores all words with total frequency lower than this.

workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).

sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.

In [48]:
def generate_word2vec_model(stemmed_df, filename, stem_column_name, sg, vector_size, window, min_count, workers):
    from gensim.models import Word2Vec
    # Skip-gram model (sg = 1)
    filename = f"{filename}_sg_{sg}_vec_{vector_size}_wind_{window}_min_{min_count}_workers_{workers}.wordvectors"
    stemmed_tokens = pd.Series(stemmed_df[stem_column_name]).values
    # Train the Word2Vec Model
    w2v_model = Word2Vec(stemmed_tokens, min_count = min_count, vector_size = vector_size, workers = workers, window = window, sg = sg, cbow_mean = 1)
    w2v_model_wv = w2v_model.wv
    w2v_model_wv.save(OUTPUT_FOLDER + filename)
    
    return w2v_model_wv

w2v_model = generate_word2vec_model(df_potter_stemmed, 'stemmed_text', 1, vector_size=100, min_count=5, window=8, workers=4)

# print(w2v_model_cbow_wv.most_similar('good'))
# print(w2v_model_sg_wv.most_similar('good'))




### Load model

In [50]:
import numpy as np
from gensim.models import Word2Vec
# Load the model from the model file
vector_size = 1000
window = 3
min_count = 1
workers = 3
sg = 1

def load_word2vec_model(reduced_mode_file, stem_column_name):
    sg_w2v_model = Word2Vec.load(reduced_mode_file)
    sg_w2v_model_wv = sg_w2v_model.wv
    # Unique ID of the word
    print("Index of the word 'action':")
    print(sg_w2v_model_wv.key_to_index["action"])
    # Total number of the words 
    print(len(sg_w2v_model_wv.key_to_index))
    # Print the size of the word2vec vector for one word
    print("Length of the vector generated for a word")
    print(len(sg_w2v_model_wv['action']))
    # Get the mean for the vectors for an example review
    print("Print the length after taking average of all word vectors in a sentence:")
    print(np.mean([sg_w2v_model_wv[token] for token in df_potter_stemmed[stem_column_name][0]], axis=0))
    return sg_w2v_model
    
w2v_model_cbow = load_word2vec_model("../model/word2vec_CBOW_0.model", 'stemmed_text')
w2v_model_sg = load_word2vec_model("../model/word2vec_skipgram_1.model", 'stemmed_text')

w2v_model_cbow_wv = w2v_model_cbow.wv
w2v_model_sg_wv = w2v_model_sg.wv

Index of the word 'action':
1736
101855
Length of the vector generated for a word
1000
Print the length after taking average of all word vectors in a sentence:
[ 1.70858055e-01  1.97836414e-01  2.20841214e-01 -1.56638101e-01
  1.15740329e-01  1.78239465e-01  1.63478181e-01 -2.10290596e-01
  2.37063989e-01  2.74408069e-02  8.22715312e-02  1.82533324e-01
 -7.50373006e-02  1.92195401e-01  3.21462095e-01 -1.68181788e-02
 -1.83908254e-01 -1.38724118e-01 -1.45018492e-02 -4.66892160e-02
  3.02184135e-01 -2.11970225e-01  1.44068122e-01  2.88798697e-02
  5.72037339e-01 -2.13407412e-01  3.03132057e-01 -1.50991410e-01
 -1.27851339e-02  2.01229692e-01  2.16573384e-02 -1.56425387e-01
 -4.42061037e-01  1.09895207e-01  4.22592193e-01  1.03549778e-01
 -2.43627965e-01 -1.02742404e-01  1.84394349e-03 -8.70813578e-02
  9.65657979e-02  1.42059073e-01 -2.71252453e-01  1.10590853e-01
  2.52261162e-01  6.30435720e-02 -2.10903630e-01 -2.38582436e-02
  7.99152348e-03 -8.79225209e-02  2.17665181e-01 -8.27851892

In [51]:
def save_word2vec_to_csv(X_set, sg_w2v_model_wv, stem_col, word2vec_filename):
    with open(word2vec_filename, 'w+') as word2vec_file:
        for index, row in X_set.iterrows():
            model_vector = (np.mean([sg_w2v_model_wv[token] for token in row[stem_col]], axis=0))
            v_norm = model_vector / (np.linalg.norm(model_vector) + 1e-16)
            v_norm = v_norm.tolist()
            if index == 0:
                header = ",".join(str(ele) for ele in range(1000))
                word2vec_file.write(header)
                word2vec_file.write("\n")
            # Check if the line exists else it is vector of zeros
            if type(v_norm) is list:  
                line1 = ",".join( [str(vector_element) for vector_element in v_norm] )
            else:
                line1 = ",".join([str(0) for i in range(1000)])
            word2vec_file.write(line1)
            word2vec_file.write('\n')
           
def save_word2vec_to_csv_round(X_set, sg_w2v_model_wv, stem_col, word2vec_filename):
    with open(word2vec_filename, 'w+') as word2vec_file:
        for index, row in X_set.iterrows():
            model_vector = (np.mean([sg_w2v_model_wv[token] for token in row[stem_col]], axis=0))
            v_norm = model_vector / (np.linalg.norm(model_vector) + 1e-16)
            v_norm = np.round(v_norm, decimals = 6 ).tolist()
            if index == 0:
                header = ",".join(str(ele) for ele in range(1000))
                word2vec_file.write(header)
                word2vec_file.write("\n")
            # Check if the line exists else it is vector of zeros
            if type(v_norm) is list:  
                line1 = ",".join( [str(vector_element) for vector_element in v_norm] )
            else:
                line1 = ",".join([str(0) for i in range(1000)])
            word2vec_file.write(line1)
            word2vec_file.write('\n') 
    
train_X_word2vec_sg_filename = "../model/train_X_0.3_wv.csv"
test_X_word2vec_sg_filename = "../model/test_X_0.3_wv.csv"

train_X_word2vec_cbow_filename = "../model/train_X_0.3_wv.csv"
test_X_word2vec_cbow_filename = "../model/test_X_0.3_wv.csv"

# save_word2vec_to_csv(X_train, w2v_model_sg_wv, "stemmed_text", train_X_word2vec_sg_filename)
# save_word2vec_to_csv(X_test, w2v_model_sg_wv, "stemmed_text", test_X_word2vec_sg_filename)

# save_word2vec_to_csv(X_train, w2v_model_cbow_wv, "stemmed_text", train_X_word2vec_cbow_filename)
# save_word2vec_to_csv(X_test, w2v_model_cbow_wv, "stemmed_text", test_X_word2vec_cbow_filename)


# Training Model

## Load Training and Testing Set

In [52]:
def load_train_w2v_from_csv(word2vec_filename):
    train_word2vec_df = pd.read_csv(word2vec_filename)
    return train_word2vec_df

def load_test_wv_w2v_from_csv(test_X_word2vec_filename):
    return pd.read_csv(test_X_word2vec_filename)

X_train_wv = load_train_w2v_from_csv(train_X_word2vec_cbow_filename)
X_test_wv = load_test_wv_w2v_from_csv(test_X_word2vec_cbow_filename)


## Decision Tree Classifier

In [53]:
def train_decision_tree_word2vec(X_train_wv, Y_train):
    from sklearn.tree import DecisionTreeClassifier
    #Initialize the model
    clf_decision_word2vec = DecisionTreeClassifier()
    # Fit the model
    clf_decision_word2vec.fit(X_train_wv, Y_train['target'])
    
    import joblib
    joblib.dump(clf_decision_word2vec, OUTPUT_FOLDER + 'clf_dt_cbow.pkl')
    
    return clf_decision_word2vec

train_decision_tree_word2vec(X_train_wv, Y_train)

Testing the model

In [54]:
def test_decision_tree_word2vec(Y_test, X_test_wv):
    from sklearn.metrics import classification_report
    from joblib import load
    # Load the model from the file
    clf_decision_word2vec = load("../model/clf_dt_cbow.pkl")
    test_predictions_word2vec = clf_decision_word2vec.predict(X_test_wv)

    print(classification_report(Y_test['target'], test_predictions_word2vec))
    
test_decision_tree_word2vec(Y_test, X_test_wv)

              precision    recall  f1-score   support

          -1       0.64      0.65      0.64     72195
           1       0.64      0.64      0.64     71805

    accuracy                           0.64    144000
   macro avg       0.64      0.64      0.64    144000
weighted avg       0.64      0.64      0.64    144000



## SGDClassifier

In [55]:
def train_sgd_clf(Y_train):
    from sklearn.linear_model import Perceptron
    #Initialize the model
    clf = Perceptron()
    batch = 100000
    classes = [-1, 1]
    for i in range(0, len(Y_train), batch):
        print("Training the model with batch size of 100000 of batch ", i)
        X_train_wv = pd.read_csv(train_X_word2vec_filename, skiprows=i, nrows=batch, header=1)
        train_target = Y_train['target'][i:i+batch]
        # Fit the model
        clf.partial_fit(X_train_wv, train_target, classes = classes)
    
    import joblib
    joblib.dump(clf, OUTPUT_FOLDER + 'sgd_T160_full_batched.pkl')
    
    return clf



## SVM

### Standard

In [56]:
def train_svc_word2vec(X_train_wv, Y_train):
    from sklearn.svm import SVC
    #Initialize the model
    svm_classifier = SVC()

    # Fit the model
    svm_classifier.fit(X_train_wv, Y_train['target'])
    
    import joblib
    joblib.dump(svm_classifier, OUTPUT_FOLDER + 'svm_classifier.pkl')
    
    return svm_classifier

In [57]:
def test_svc_word2vec(Y_test, X_test_wv):
    from sklearn.metrics import classification_report
    from joblib import load
    svm_classifier = load(OUTPUT_FOLDER + 'svm_classifier.pkl')
    reduced_test_features_word2vec = pd.DataFrame(X_test_wv).sample(frac=0.1, random_state=42)
    test_predictions_word2vec_svm = svm_classifier.predict(reduced_test_features_word2vec)
    print(len(test_predictions_word2vec_svm))
    
    print(classification_report(pd.Series(Y_test['target']).sample(frac=0.1, random_state=42),test_predictions_word2vec_svm))
    


### with standard scaler

Do not be a retard: low C == less rigid: more spaces for error: train faster

C = 1.0 take 4 hours

C = 2.0 takes more than 6hours before terminated


In [58]:
def train_svc_word2vec_general(X_train_wv, Y_train, C, kernel):
    from sklearn.svm import SVC
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    #Initialize the model
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', kernel = kernel, C = C))
    # Fit the model
    clf.fit(X_train_wv, Y_train['target'])
    import joblib
    joblib.dump(clf, OUTPUT_FOLDER + 'svm_classifier_C0.2_linear.pkl')
    
    return clf

# clf = train_svc_word2vec_general(X_train_wv, Y_train, 0.2, 'linear')

In [59]:
def test_svc_word2vec_general(Y_test, X_test_wv, clf):
    from sklearn.metrics import classification_report
    # from joblib import load
    # clf = load(OUTPUT_FOLDER + 'svm_classifier_scl_linear.pkl')
    test_predictions_word2vec_svm_scaled = clf.predict(X_test_wv)

    print(classification_report(Y_test['target'], test_predictions_word2vec_svm_scaled))

# test_svc_word2vec_general(Y_test, X_test_wv, clf)

## RandomForestClassifier

In [60]:
def train_random_forest_clf(X_train_wv, Y_train):
    from sklearn.ensemble import RandomForestClassifier
    #Initialize the model
    clf_decision_word2vec = RandomForestClassifier()
    # Fit the model
    clf_decision_word2vec.fit(X_train_wv, Y_train['target'])
    
    import joblib
    joblib.dump(clf_decision_word2vec, OUTPUT_FOLDER + 'random_forest_dt_clf.pkl')
    
    return clf_decision_word2vec

clf_rfdt = train_random_forest_clf(X_train_wv, Y_train)

In [None]:
def test_random_forest_clf(Y_test, X_test_wv, clf):
    from sklearn.metrics import classification_report
    # from joblib import load
    # clf = load(OUTPUT_FOLDER + 'svm_classifier_scl_linear.pkl')
    test_predictions_word2vec_svm_scaled = clf.predict(X_test_wv)

    print(classification_report(Y_test['target'], test_predictions_word2vec_svm_scaled))

test_random_forest_clf(Y_test, X_test_wv, clf_rfdt)

              precision    recall  f1-score   support

          -1       0.25      1.00      0.40         1
           1       1.00      0.25      0.40         4

    accuracy                           0.40         5
   macro avg       0.62      0.62      0.40         5
weighted avg       0.85      0.40      0.40         5

