# Initialisation

In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

OUTPUT_FOLDER = '../model/'

seed = 88
    


In [101]:
column_names = ["target", "id", "date", "flag", "user", "text"]

# Read the CSV file with specified column names
df = pd.read_csv("../dataset/training.1600000.processed.noemoticon.csv", 
                 encoding="ISO-8859-1", names=column_names)

def reduce_sample(df, frac, random_state):
    df = df.sample(frac=frac, random_state=random_state)
    df = df.reset_index(drop=True)
    return df

df = reduce_sample(df, 0.3, seed)
# df_cleaned = reduce_sample(df_cleaned, 0.1, 42)

Columns in dataset

In [102]:
print("Columns in the original dataset:\n")
print(df.columns)

Columns in the original dataset:

Index(['target', 'id', 'date', 'flag', 'user', 'text'], dtype='object')


Example of an Row in dataset

In [103]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1974058893,Sat May 30 12:21:30 PDT 2009,NO_QUERY,BrookeAmanda,Ok it's only been a couple hours since dad has...
1,4,1998068077,Mon Jun 01 17:56:23 PDT 2009,NO_QUERY,KarinaKornacka,@graceofrhythm HAHA no i would never do that!!...
2,4,1999729993,Mon Jun 01 20:43:12 PDT 2009,NO_QUERY,stevegaghagen,Law of Attraction Creations: Law of Attraction...
3,4,2006627206,Tue Jun 02 11:26:46 PDT 2009,NO_QUERY,Hecie,is ordering ticketsssss EEEE (: &lt;3
4,4,1991292674,Mon Jun 01 06:46:16 PDT 2009,NO_QUERY,shanaloren,@STO_MAC nah im not mad at u....luv u too


In [104]:
df.describe()

Unnamed: 0,target,id
count,480000.0,480000.0
mean,1.997267,1999221000.0
std,2.0,193630000.0
min,0.0,1467810000.0
25%,0.0,1956973000.0
50%,0.0,2002237000.0
75%,4.0,2177253000.0
max,4.0,2329205000.0


Clean data

In [105]:
def clean_dataset(df, drop_columns):
    df_cleaned = df.dropna()
    df_cleaned = df_cleaned.drop_duplicates()
    df_cleaned = df_cleaned.drop(columns=drop_columns)
    df_cleaned = df_cleaned.reset_index(drop=True)
    df_cleaned.describe()
    return df_cleaned

df_cleaned = clean_dataset(df, ["date", "id", "flag", "user"])

In [106]:
df_cleaned.head()

Unnamed: 0,target,text
0,0,Ok it's only been a couple hours since dad has...
1,4,@graceofrhythm HAHA no i would never do that!!...
2,4,Law of Attraction Creations: Law of Attraction...
3,4,is ordering ticketsssss EEEE (: &lt;3
4,4,@STO_MAC nah im not mad at u....luv u too


Remove twitter tag and hashtag

In [107]:
import re
def remove_hashtag(df_cleaned):
    df_cleaned['text'] = df_cleaned['text'].apply(lambda x: re.sub(r"http\S+|@\S+|#\S+", "", x))
    return df_cleaned

df_cleaned = remove_hashtag(df_cleaned)

In [108]:
df_cleaned.head()

Unnamed: 0,target,text
0,0,Ok it's only been a couple hours since dad has...
1,4,HAHA no i would never do that!!! I actually m...
2,4,Law of Attraction Creations: Law of Attraction...
3,4,is ordering ticketsssss EEEE (: &lt;3
4,4,nah im not mad at u....luv u too


convert target back to -1 0 1


In [109]:
def convert_to_unitary_target(df_cleaned, target_column):
    df_cleaned[target_column] = df_cleaned[target_column].map({0: -1, 2: 0, 4: 1})
    return df_cleaned

df_cleaned = convert_to_unitary_target(df_cleaned, 'target')
df_cleaned['target'].value_counts()

target
-1    240328
 1    239672
Name: count, dtype: int64

Tokenisation

In [110]:
def tokenize_text(df_cleaned, text_column, tokenized_text_column):
    from gensim.utils import simple_preprocess
    # Tokenize the text column to get the new column 'tokenized_text'
    df_cleaned[tokenized_text_column] = [simple_preprocess(line, deacc=True) for line in df_cleaned[text_column]]
    return df_cleaned
    
df_cleaned = tokenize_text(df_cleaned, 'text', 'tokenized_text')
df_cleaned['tokenized_text'].head()

0    [ok, it, only, been, couple, hours, since, dad...
1    [haha, no, would, never, do, that, actually, m...
2    [law, of, attraction, creations, law, of, attr...
3                [is, ordering, ticketsssss, eeee, lt]
4                    [nah, im, not, mad, at, luv, too]
Name: tokenized_text, dtype: object

# Stemming & Lemma

In [111]:
df_to_be_stemmed = df_cleaned.copy()

### PoterStammer

In [112]:
def porter_stemmer_on_text(df_to_be_stemmed, token_text_column, stemmed_text_column):
    from gensim.parsing.porter import PorterStemmer
    porter_stemmer = PorterStemmer()
    df_potter_stemmed = df_to_be_stemmed.copy()
    # Get the stemmed_tokens
    df_potter_stemmed[stemmed_text_column] = [[porter_stemmer.stem(word) for word in tokens] 
                                        for tokens in df_potter_stemmed[token_text_column]]  
    return df_potter_stemmed



0    [ok, it, onli, been, coupl, hour, sinc, dad, h...
1    [haha, no, would, never, do, that, actual, mad...
2    [law, of, attract, creation, law, of, attract,...
3                   [is, order, ticketsssss, eeee, lt]
4                    [nah, im, not, mad, at, luv, too]
5    [centuri, room, tast, the, rainbow, with, your...
6    [ari, also, got, servic, award, for, the, comm...
7    [man, thought, somethin, wa, fina, go, done, s...
8           [leav, moscow, when, it, final, get, warm]
9                           [wish, got, summer, break]
Name: stemmed_text, dtype: object

### Lancaster

In [113]:
def lancaster_stemmer_on_text(df_to_be_stemmed, token_text_column, stemmed_text_column):
    from nltk.stem.lancaster import LancasterStemmer
    lancaster_stemmer = LancasterStemmer()
    df_lancaster_stemmed = df_to_be_stemmed.copy()
    # Get the stemmed_tokens
    df_lancaster_stemmed[stemmed_text_column] = [[lancaster_stemmer.stem(word) for word in tokens] 
                                        for tokens in df_lancaster_stemmed[token_text_column]]
    
    return df_lancaster_stemmed


### Snowball

In [114]:
def snowball_stemmer_on_text(df_to_be_stemmed, token_text_column, stemmed_text_column):
    from nltk.stem.snowball import EnglishStemmer
    snowball_stemmer = EnglishStemmer()
    df_snowball_stemmed = df_to_be_stemmed.copy()
    # Get the stemmed_tokens
    df_snowball_stemmed[stemmed_text_column] = [[snowball_stemmer.stem(word) for word in tokens] 
                                        for tokens in df_snowball_stemmed[token_text_column]]
    
    return df_snowball_stemmed


### Lemmatisation

In [115]:
def lemmatize_text(df_to_be_stemmed, token_text_column, lemmatized_text_column):
    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    df_lemmatized = df_to_be_stemmed.copy()
    
    # Get the lemmatized_tokens
    df_lemmatized[lemmatized_text_column] = [[wordnet_lemmatizer.lemmatize(word) for word in tokens] 
                                          for tokens in df_lemmatized[token_text_column]]


In [None]:
df_potter_stemmed = porter_stemmer_on_text(df_to_be_stemmed, 'tokenized_text', 'stemmed_text')
df_potter_stemmed['stemmed_text'].head(10)    

## Split into Train and Test Sets

- Train data ( Subset of data for training ML Model) ~70%
- Test data (Subset of data for testing ML Model trained from the train data)

In [116]:
from sklearn.model_selection import train_test_split

def split_train_test(data, sentiment_value_col, tokenised_text_col, test_size=0.3, shuffle_state=True):
    X_train, X_test, Y_train, Y_test = train_test_split( data[tokenised_text_col],
                                                        data[sentiment_value_col], 
                                                        shuffle=shuffle_state,
                                                        test_size=test_size, 
                                                        random_state=15)
    print("Value counts for Train sentiments")
    print(Y_train.value_counts())
    print("Value counts for Test sentiments")
    print(Y_test.value_counts())
    print(type(X_train))
    print(type(Y_train))
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    Y_train = Y_train.to_frame()
    Y_train = Y_train.reset_index()
    Y_test = Y_test.to_frame()
    Y_test = Y_test.reset_index()
    print(X_train.head())
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = split_train_test(df_potter_stemmed, 'target', 'stemmed_text')

Value counts for Train sentiments
target
-1    168133
 1    167867
Name: count, dtype: int64
Value counts for Test sentiments
target
-1    72195
 1    71805
Name: count, dtype: int64
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
    index                                       stemmed_text
0  266527  [lil, is, move, to, alabama, at, the, end, of,...
1  401839  [hangov, and, unfortun, there, lot, of, work, ...
2  218733                 [damn, won, thi, round, homi, lol]
3  435715       [dont, you, worri, hear, the, tequila, call]
4  203570  [wish, could, join, you, on, fridai, but, on, ...


# Word2Vec 

## Save-gram approach

### Generate model

In [117]:
def generate_word2vec_model(stemmed_df, filename, stem_column_name):
    from gensim.models import Word2Vec
    # Skip-gram model (sg = 1)
    vector_size = 1000
    window = 3
    min_count = 1
    workers = 3
    sg = 1
    filename = filename + ".model"
    stemmed_tokens = pd.Series(stemmed_df[stem_column_name]).values
    # Train the Word2Vec Model
    w2v_model = Word2Vec(stemmed_tokens, min_count = min_count, vector_size = vector_size, workers = workers, window = window, sg = sg)
    w2v_model.save(filename)
    
    return w2v_model




### Load model

In [118]:
import numpy as np
from gensim.models import Word2Vec
# Load the model from the model file
vector_size = 1000
window = 3
min_count = 1
workers = 3
sg = 1

def load_word2vec_model(reduced_mode_file, stem_column_name):
    sg_w2v_model = Word2Vec.load(reduced_mode_file)
    sg_w2v_model_wv = sg_w2v_model.wv
    # Unique ID of the word
    print("Index of the word 'action':")
    print(sg_w2v_model_wv.key_to_index["action"])
    # Total number of the words 
    print(len(sg_w2v_model_wv.key_to_index))
    # Print the size of the word2vec vector for one word
    print("Length of the vector generated for a word")
    print(len(sg_w2v_model_wv['action']))
    # Get the mean for the vectors for an example review
    print("Print the length after taking average of all word vectors in a sentence:")
    print(np.mean([sg_w2v_model_wv[token] for token in df_potter_stemmed[stem_column_name][0]], axis=0))
    return sg_w2v_model
    
sg_w2v_model = load_word2vec_model("../model/word2vec_1000savegram.model", 'stemmed_text')
sg_w2v_model_wv = sg_w2v_model.wv

Index of the word 'action':
1725
212909
Length of the vector generated for a word
1000
Print the length after taking average of all word vectors in a sentence:
[ 1.10997356e-01  1.29606901e-02  1.31717682e-01  1.03594407e-01
 -1.26589434e-02 -5.93637340e-02  4.32857983e-02  7.32175168e-03
 -1.02565773e-01  1.25884488e-01  1.30147506e-02 -4.81054820e-02
  3.28933215e-03  3.36098783e-02  1.23340942e-01 -2.79482193e-02
 -1.01861104e-01  1.62472036e-02  1.87799558e-02 -2.11371675e-01
  8.02890286e-02 -2.31562387e-02  4.07561325e-02  1.74559071e-03
  7.90047348e-02 -3.33950645e-03  9.08088088e-02 -3.37177813e-02
 -2.34114110e-01  7.87946358e-02  7.36722648e-02 -7.28675872e-02
 -3.23941149e-02 -1.14671990e-01  1.14956014e-01  2.42509134e-02
  5.61343543e-02 -3.91341671e-02 -1.00916252e-01 -1.79448381e-01
 -1.53343290e-01  7.30233118e-02 -6.34375215e-02  1.03500888e-01
 -8.34881514e-02 -3.69986929e-02 -1.38792112e-01  1.04046136e-01
 -1.43292069e-01  2.93894075e-02 -4.37203376e-03 -3.57635505

In [119]:
def save_word2vec_to_csv(X_set, sg_w2v_model_wv, stem_col, word2vec_filename):
    with open(word2vec_filename, 'w+') as word2vec_file:
        for index, row in X_set.iterrows():
            model_vector = (np.mean([sg_w2v_model_wv[token] for token in row[stem_col]], axis=0))
            v_norm = model_vector / (np.linalg.norm(model_vector) + 1e-16)
            v_norm = v_norm.tolist()
            if index == 0:
                header = ",".join(str(ele) for ele in range(1000))
                word2vec_file.write(header)
                word2vec_file.write("\n")
            # Check if the line exists else it is vector of zeros
            if type(v_norm) is list:  
                line1 = ",".join( [str(vector_element) for vector_element in v_norm] )
            else:
                line1 = ",".join([str(0) for i in range(1000)])
            word2vec_file.write(line1)
            word2vec_file.write('\n')
           
def save_word2vec_to_csv_round(X_set, sg_w2v_model_wv, stem_col, word2vec_filename):
    with open(word2vec_filename, 'w+') as word2vec_file:
        for index, row in X_set.iterrows():
            model_vector = (np.mean([sg_w2v_model_wv[token] for token in row[stem_col]], axis=0))
            v_norm = model_vector / (np.linalg.norm(model_vector) + 1e-16)
            v_norm = np.round(v_norm, decimals = 6 ).tolist()
            if index == 0:
                header = ",".join(str(ele) for ele in range(1000))
                word2vec_file.write(header)
                word2vec_file.write("\n")
            # Check if the line exists else it is vector of zeros
            if type(v_norm) is list:  
                line1 = ",".join( [str(vector_element) for vector_element in v_norm] )
            else:
                line1 = ",".join([str(0) for i in range(1000)])
            word2vec_file.write(line1)
            word2vec_file.write('\n') 
    
train_X_word2vec_filename = "../model/train_X_0.3_wv.csv"
test_X_word2vec_filename = "../model/test_X_0.3_wv.csv"

train_X_word2vec_rounded_filename = "../model/train_X_full_wv.csv"
test_X_word2vec_rounded_filename = "../model/test_X_full_wv.csv"

save_word2vec_to_csv(X_train, sg_w2v_model_wv, "stemmed_text", train_X_word2vec_filename)
save_word2vec_to_csv(X_test, sg_w2v_model_wv, "stemmed_text", test_X_word2vec_filename)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


# Training Model

## Load Training and Testing Set

In [120]:
def load_train_w2v_from_csv(word2vec_filename):
    train_word2vec_df = pd.read_csv(word2vec_filename)
    return train_word2vec_df

def load_test_wv_w2v_from_csv(test_X_word2vec_filename):
    return pd.read_csv(test_X_word2vec_filename)

X_train_wv = load_train_w2v_from_csv(train_X_word2vec_filename)

X_train_wv.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.048102,0.015179,0.042033,0.03247,-0.000219,-0.003463,0.040373,0.010535,-0.047874,0.038799,...,0.003958,-0.026251,0.041925,-0.023194,0.032529,-0.014211,-0.028527,-0.027768,-0.004475,-0.04472
1,0.044427,0.015315,0.045554,0.037365,-0.001599,-0.007755,0.014855,0.012154,-0.053852,0.039776,...,-0.006429,-0.029311,0.030579,-0.026832,0.031129,0.007292,-0.018692,-0.0348,-0.008332,-0.042234
2,0.038869,0.012879,0.054727,0.045526,0.000535,-0.022,0.039736,0.025695,-0.075243,0.055876,...,0.027775,-0.019863,0.000699,-0.000591,0.037935,-0.007918,0.000477,-0.032762,0.005987,-0.036511
3,0.04919,0.018796,0.036048,0.039769,0.018974,-0.033573,0.011563,0.004603,-0.053011,0.063128,...,0.000155,-0.015801,0.025616,-0.017155,0.030357,-0.015505,-0.021535,-0.054803,-0.007507,-0.02485
4,0.034049,0.006086,0.031242,0.038,-0.010132,-0.01094,0.038614,0.013778,-0.037274,0.035137,...,-0.003304,-0.030206,0.025648,-0.028249,0.020745,-0.020653,-0.022377,-0.03211,-0.002292,-0.054542


In [121]:
X_test_wv = load_train_w2v_from_csv(test_X_word2vec_filename)
X_test_wv.head()    


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.053024,0.028613,0.024006,0.042881,-0.021136,-0.024381,0.010766,0.003945,-0.056567,0.060535,...,-0.012153,-0.030685,0.021339,-0.007457,0.032346,-0.013682,-0.02079,-0.050811,0.007726,-0.050212
1,0.047366,0.016169,0.018548,0.023616,0.017228,0.00013,0.020927,0.008233,-0.049627,0.044406,...,-0.011611,-0.038126,0.038818,-0.016775,0.032452,-0.020109,-0.023841,-0.052119,0.019372,-0.030337
2,0.040361,0.025301,0.042857,0.043931,0.005333,-0.005833,0.011292,0.012258,-0.065365,0.03735,...,0.013502,-0.03574,0.031067,-0.0185,0.046343,-0.025007,0.00085,-0.047883,0.010271,-0.032912
3,0.053678,0.018155,0.054913,0.03718,0.017946,-0.018485,0.012916,0.031583,-0.064868,0.044907,...,-0.007292,-0.027373,0.038207,-0.029403,0.048433,-0.023186,-0.020249,-0.044426,-0.002989,-0.03768
4,0.049551,-0.00225,0.021974,0.021378,0.000146,-0.006801,0.026745,0.022748,-0.040269,0.015219,...,-0.0059,-0.02656,0.057077,0.00304,0.013152,-0.028337,-0.028905,-0.042834,-0.006099,-0.042736


## Decision Tree Classifier

In [122]:
def train_decision_tree_word2vec(X_train_wv, Y_train):
    from sklearn.tree import DecisionTreeClassifier
    #Initialize the model
    clf_decision_word2vec = DecisionTreeClassifier()
    # Fit the model
    clf_decision_word2vec.fit(X_train_wv, Y_train['target'])
    
    import joblib
    joblib.dump(clf_decision_word2vec, OUTPUT_FOLDER + 'clf_decision_word2vec.pkl')
    
    return clf_decision_word2vec

train_decision_tree_word2vec(X_train_wv, Y_train)

Testing the model

In [None]:
def test_decision_tree_word2vec(Y_test, X_test_wv):
    from sklearn.metrics import classification_report
    from joblib import load
    # Load the model from the file
    clf_decision_word2vec = load("../model/decision_tree_word2vec.pkl")
    test_predictions_word2vec = clf_decision_word2vec.predict(X_test_wv)

    print(classification_report(Y_test['target'], test_predictions_word2vec))
    
test_decision_tree_word2vec(Y_test, X_test_wv)

## SGDClassifier

In [None]:
def train_sgd_clf(Y_train):
    from sklearn.linear_model import Perceptron
    #Initialize the model
    clf = Perceptron()
    batch = 100000
    classes = [-1, 1]
    for i in range(0, len(Y_train), batch):
        print("Training the model with batch size of 100000 of batch ", i)
        X_train_wv = pd.read_csv(train_X_word2vec_filename, skiprows=i, nrows=batch, header=1)
        train_target = Y_train['target'][i:i+batch]
        # Fit the model
        clf.partial_fit(X_train_wv, train_target, classes = classes)
    
    import joblib
    joblib.dump(clf, OUTPUT_FOLDER + 'sgd_T160_full_batched.pkl')
    
    return clf



Training the model with batch size of 100000 of batch  0
Training the model with batch size of 100000 of batch  100000


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- -0.0001527181448182091
- -0.00019450907711870968
- -0.0002684208157006651
- -0.00027748121647164226
- -0.0003566486411727965
- ...
Feature names seen at fit time, yet now missing:
- -0.00013550929725170135
- -0.00016435066936537623
- -0.00020373801817186177
- -0.00028594405739568174
- -0.00034137649345211685
- ...


## SVM

### Standard

In [None]:
def train_svc_word2vec(X_train_wv, Y_train):
    from sklearn.svm import SVC
    #Initialize the model
    svm_classifier = SVC()

    # Fit the model
    svm_classifier.fit(X_train_wv, Y_train['target'])
    
    import joblib
    joblib.dump(svm_classifier, OUTPUT_FOLDER + 'svm_classifier.pkl')
    
    return svm_classifier

In [None]:
def test_svc_word2vec(Y_test, X_test_wv):
    from sklearn.metrics import classification_report
    from joblib import load
    svm_classifier = load(OUTPUT_FOLDER + 'svm_classifier.pkl')
    reduced_test_features_word2vec = pd.DataFrame(X_test_wv).sample(frac=0.1, random_state=42)
    test_predictions_word2vec_svm = svm_classifier.predict(reduced_test_features_word2vec)
    print(len(test_predictions_word2vec_svm))
    
    print(classification_report(pd.Series(Y_test['target']).sample(frac=0.1, random_state=42),test_predictions_word2vec_svm))
    


### with standard scaler

Do not be a retard: low C == less rigid: more spaces for error: train faster

C = 1.0 take 4 hours

C = 2.0 takes more than 6hours before terminated


In [None]:
def train_svc_word2vec_general(X_train_wv, Y_train, C, kernel):
    from sklearn.svm import SVC
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    #Initialize the model
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', kernel = kernel, C = C))
    # Fit the model
    clf.fit(X_train_wv, Y_train['target'])
    import joblib
    joblib.dump(clf, OUTPUT_FOLDER + 'svm_classifier_C0.2_linear.pkl')
    
    return clf

clf = train_svc_word2vec_general(X_train_wv, Y_train, 0.2, 'linear')

In [None]:
def test_svc_word2vec_general(Y_test, X_test_wv, clf):
    from sklearn.metrics import classification_report
    # from joblib import load
    # clf = load(OUTPUT_FOLDER + 'svm_classifier_scl_linear.pkl')
    test_predictions_word2vec_svm_scaled = clf.predict(X_test_wv)

    print(classification_report(Y_test['target'], test_predictions_word2vec_svm_scaled))

test_svc_word2vec_general(Y_test, X_test_wv, clf)

## RandomForestClassifier

In [None]:
def train_random_forest_clf(X_train_wv, Y_train):
    from sklearn.ensemble import RandomForestClassifier
    #Initialize the model
    clf_decision_word2vec = RandomForestClassifier()
    # Fit the model
    clf_decision_word2vec.fit(X_train_wv, Y_train['target'])
    
    import joblib
    joblib.dump(clf_decision_word2vec, OUTPUT_FOLDER + 'random_forest_dt_clf.pkl')
    
    return clf_decision_word2vec

clf_rfdt = train_random_forest_clf(X_train_wv, Y_train)

In [None]:
def test_random_forest_clf(Y_test, X_test_wv, clf):
    from sklearn.metrics import classification_report
    # from joblib import load
    # clf = load(OUTPUT_FOLDER + 'svm_classifier_scl_linear.pkl')
    test_predictions_word2vec_svm_scaled = clf.predict(X_test_wv)

    print(classification_report(Y_test['target'], test_predictions_word2vec_svm_scaled))

test_random_forest_clf(Y_test, X_test_wv, clf_rfdt)