In [1]:
# Import settings

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

In [2]:
# Read the training CSV, replace missing values with '' and cast every column to str
train = pd.read_csv('data/train.csv').fillna('').astype(str)

In [3]:
# Preview the first rows of the loaded training frame
train.head()

Unnamed: 0,ID,TEXT,LABEL
0,7850790573542594519,If you love good films don't ever buy this pei...,2
1,9392069522632994700,The 33 percent of the nations nitwits that sti...,2
2,5083704536542443514,I saw Anatomy years ago -- dubbed at a friends...,1
3,12418349755186772171,Dark Remains is a home run plain and simple. T...,1
4,12144957944004619479,Feh. This movie started out in an interesting ...,2


In [4]:
# Initialize the pre-trained VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Map every review ID to its VADER scores (neg, neu, pos and the compound score,
# which gives the overall direction of the sentiment in the text).
# Only the 'ID' and 'TEXT' columns are walked, which avoids the per-row Series
# that DataFrame.iterrows() has to build for each row.
# NOTE: the mapping is keyed by ID, so rows sharing an ID keep only the last score.
res = {
    myid: sia.polarity_scores(text)
    for myid, text in tqdm(zip(train['ID'], train['TEXT']), total=len(train))
}

  0%|          | 0/70317 [00:00<?, ?it/s]

In [5]:
# Turn the {ID: scores} mapping into one row per ID, then attach the original train columns

scores_by_id = pd.DataFrame(res).transpose()
vaders = scores_by_id.rename_axis('ID').reset_index()
train_df = pd.merge(vaders, train, how='left')

In [6]:
# Preview the merged frame: VADER scores joined with the original train columns
train_df.head()

Unnamed: 0,ID,neg,neu,pos,compound,TEXT,LABEL
0,7850790573542594519,0.137,0.474,0.389,0.6996,If you love good films don't ever buy this pei...,2
1,9392069522632994700,0.193,0.699,0.108,-0.968,The 33 percent of the nations nitwits that sti...,2
2,5083704536542443514,0.105,0.76,0.136,0.3469,I saw Anatomy years ago -- dubbed at a friends...,1
3,12418349755186772171,0.131,0.777,0.092,-0.5815,Dark Remains is a home run plain and simple. T...,1
4,12144957944004619479,0.265,0.633,0.102,-0.9981,Feh. This movie started out in an interesting ...,2


In [7]:
# Average of each sentiment score (including the compound score) within every label

score_columns = ['neg', 'neu', 'pos', 'compound']
scores_by_label = train_df.groupby('LABEL')[score_columns]
average_values = scores_by_label.mean()
print("Average values by Label:")
print(average_values)

Average values by Label:
            neg       neu       pos  compound
LABEL                                        
0      0.041618  0.721098  0.236781  0.503356
1      0.064054  0.757730  0.178214  0.675200
2      0.121956  0.766397  0.111646 -0.083287


In [8]:
# Features and target used for the train/validation split
X_text = train_df['TEXT']  # Raw review text
X_compound = train_df[['compound', 'neg', 'pos']]  # VADER 'compound', 'neg' and 'pos' scores
y = train_df['LABEL']

# Hold out 20% of the rows for validation (80% for training) with a fixed seed
X_train_text, X_test_text, X_train_compound, X_test_compound, y_train, y_test = train_test_split(
    X_text, X_compound, y, test_size=0.2, random_state=42
)

# Put the text and the sentiment scores back into a single frame per split
X_train_combined = pd.concat([X_train_text, X_train_compound], axis=1)
X_test_combined = pd.concat([X_test_text, X_test_compound], axis=1)

# Preprocessing steps:
#   vectorizer            - TF-IDF over 1- to 3-grams (min_df=5, max_df=0.4): terms that
#                           appear in fewer than 5 documents or in more than 40% of the
#                           documents are ignored
#   compound_preprocessor - rescales the sentiment scores to a mean of 0 and a
#                           standard deviation of 1
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=5, max_df=0.4)
compound_preprocessor = StandardScaler()

# ColumnTransformer applies the vectorizer to 'TEXT' and the scaler to the score columns
preprocessor = ColumnTransformer(
    transformers=[
        ('text', vectorizer, 'TEXT'),
        ('compound', compound_preprocessor, ['compound', 'neg', 'pos'])
    ]
)

# Build the classifier
model = Pipeline(
    steps=[
        # Vectorize the text and scale the sentiment scores
        ('preprocessor', preprocessor),
        # Logistic Regression classifier.
        # `multi_class` is deliberately not passed: the parameter is deprecated in
        # recent scikit-learn releases, and with the 'saga' solver the default
        # already fits a multinomial model when there are three classes.
        # `random_state` is pinned because 'saga' shuffles the data, so without a
        # seed the reported scores change from run to run.
        ('classifier', LogisticRegression(max_iter=1000,
                                          C=50,
                                          solver='saga',
                                          class_weight='balanced',
                                          random_state=42,
                                         ))
    ]
)

# Train the model
model.fit(X_train_combined, y_train)

# Predict on the validation set and report per-class metrics
y_pred = model.predict(X_test_combined)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.97      0.99      0.98      6454
           1       0.90      0.89      0.90      3856
           2       0.91      0.90      0.90      3754

    accuracy                           0.94     14064
   macro avg       0.93      0.92      0.93     14064
weighted avg       0.94      0.94      0.94     14064



In [9]:
# Macro-averaged F1 score on the validation split
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
print(f1)


0.9264813555279808


In [12]:
# Read the test CSV, replace missing values (e.g. in TEXT) with '' and cast every column to str
test = pd.read_csv('data/test.csv').fillna('').astype(str)



In [13]:
# Score every test review with VADER, producing one result per row.
# The scores stay aligned with the rows of `test` instead of being keyed by ID and
# merged back, so rows that share an ID can no longer receive another row's scores.
res_test = [sia.polarity_scores(text) for text in tqdm(test['TEXT'], total=len(test))]

# Attach the sentiment values to the test data: ID and scores first,
# followed by the remaining test columns
test_vader = pd.DataFrame(res_test, index=test.index, columns=['neg', 'neu', 'pos', 'compound'])
test_vader.insert(0, 'ID', test['ID'])
test_df = pd.concat([test_vader, test.drop(columns='ID')], axis=1)

  0%|          | 0/17580 [00:00<?, ?it/s]

In [14]:
# Preview the merged test frame: VADER scores together with the test columns
test_df.head()

Unnamed: 0,ID,neg,neu,pos,compound,TEXT
0,4728459160322025755,0.038,0.856,0.106,0.8439,An excellent debut movie for the the director ...
1,1840432070229003467,0.017,0.9,0.083,0.9768,If you have a preschooler or remember how stre...
2,12623336783082722606,0.151,0.737,0.111,-0.974,What should have been a routine babysitting gi...
3,7446733850828603409,0.0,0.333,0.667,0.25,Cute but
4,16180660281866613068,0.064,0.826,0.11,0.8659,"Elvis Presley plays a ""half-breed"" Native Amer..."


In [15]:
# Select the columns the pipeline consumes: the raw text plus the 'compound', 'neg' and 'pos' scores
feature_columns = ['TEXT', 'compound', 'neg', 'pos']
X_test = test_df[feature_columns]

# Run the fitted pipeline on the test features
predictions = model.predict(X_test)

In [16]:
# Assemble the submission frame with the columns ID and LABEL (the predictions)
submission = pd.DataFrame({'ID': test_df['ID'], 'LABEL': predictions})

# Check the dataframe
submission.head()

Unnamed: 0,ID,LABEL
0,4728459160322025755,1
1,1840432070229003467,1
2,12623336783082722606,2
3,7446733850828603409,0
4,16180660281866613068,2


In [17]:
# Write the submission to a .csv file without the index column
submission.to_csv(path_or_buf='submission_v10.csv', index=False)