In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, train_test_split
import itertools
from sklearn.decomposition import PCA
import joblib
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Each split (train / valid) ships three files: an emoticon CSV, a text-sequence
# CSV and an .npz feature archive. Load them grouped by modality.

# Emoticon tables.
train_emoticon_df = pd.read_csv(filepath_or_buffer='datasets/train/train_emoticon.csv')
val_emoticon_df = pd.read_csv(filepath_or_buffer='datasets/valid/valid_emoticon.csv')

# Text-sequence tables.
train_text_seq_df = pd.read_csv(filepath_or_buffer='datasets/train/train_text_seq.csv')
val_text_seq_df = pd.read_csv(filepath_or_buffer='datasets/valid/valid_text_seq.csv')

# Feature archives; they are indexed by key in a later cell.
train_feature_data = np.load(file='datasets/train/train_feature.npz')
val_feature_data = np.load(file='datasets/valid/valid_feature.npz')

In [7]:
# Targets are read from the 'label' column of the emoticon tables.
# NOTE(review): this assumes the text-sequence and feature files are row-aligned
# with the emoticon files — confirm against the dataset description.
labels, val_labels = (frame['label'] for frame in (train_emoticon_df, val_emoticon_df))

In [8]:
# ONE HOT ENCODING THE EMOTICON DATA
# Every character of an emoticon string becomes one indicator vector; the
# vectors of a string are then concatenated into a single feature row.

def _one_hot_encode_sequences(fitted_encoder, sequences):
    """Encode equal-length character sequences with an already-fitted encoder.

    All characters of all sequences are sent through a single ``transform``
    call (instead of one call per sequence) and the result is reshaped so the
    first axis indexes sequences again.

    Parameters
    ----------
    fitted_encoder : object exposing ``transform``
        Encoder fitted on single-character rows and returning a dense array.
    sequences : list of str
        Strings to encode, iterated character by character.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_sequences, sequence_length, n_categories).

    Raises
    ------
    ValueError
        If ``sequences`` is empty, contains an empty string, or mixes lengths
        (the flattened rows could not form a rectangular matrix).
    """
    lengths = {len(seq) for seq in sequences}
    if len(lengths) != 1 or 0 in lengths:
        raise ValueError(
            f"expected non-empty sequences sharing one length, got lengths {sorted(lengths)}"
        )
    seq_len = lengths.pop()
    per_char = fitted_encoder.transform([[ch] for seq in sequences for ch in seq])
    return per_char.reshape(len(sequences), seq_len, -1)


train_emoticon_X = train_emoticon_df['input_emoticon'].to_list()
val_emoticon_X = val_emoticon_df['input_emoticon'].to_list()

# The vocabulary is built from the training strings only; characters that occur
# only in validation are encoded as all-zero vectors (handle_unknown='ignore').
# NOTE(review): iterating a str yields code points, so an emoji made of several
# code points would be split — confirm the data holds single-code-point emojis.
encoder = OneHotEncoder(sparse_output=False,handle_unknown='ignore')
all_emojis = sorted(set(''.join(train_emoticon_X)))

encoder.fit([[emoji] for emoji in all_emojis])

encoded_X_train = _one_hot_encode_sequences(encoder, train_emoticon_X)
encoded_X_train_flat = encoded_X_train.reshape(len(train_emoticon_X), -1)


encoded_X_val = _one_hot_encode_sequences(encoder, val_emoticon_X)
encoded_X_val_flat = encoded_X_val.reshape(len(val_emoticon_X), -1)
encoded_X_train_flat.shape

(7080, 2782)

In [9]:
# Read the array stored under the 'features' key of each loaded .npz archive.
deep_features, val_deep_features = (
    archive['features'] for archive in (train_feature_data, val_feature_data)
)
# Length of the innermost vector of the first training sample.
len(deep_features[0][0])

768

In [10]:
# Collapse each sample's feature block into one row: (n_samples, everything else).
n_samples_train, n_samples_val = len(deep_features), len(val_deep_features)

X_train_flattened = deep_features.reshape((n_samples_train, -1))
X_val_flattened = val_deep_features.reshape((n_samples_val, -1))

# PCA is fitted on the training rows only; both splits are then projected with
# that same fitted transform.
pca = PCA(n_components=7000).fit(X_train_flattened)
X_train_flattened_pca, X_val_flattened_pca = map(
    pca.transform, (X_train_flattened, X_val_flattened)
)

X_train_flattened_pca.shape

(7080, 7000)

In [11]:
# Convert every 'input_str' value into a row of integers, one per character.
text_seq_features, val_text_seq_features = (
    np.array([[int(digit) for digit in seq] for seq in frame['input_str']])
    for frame in (train_text_seq_df, val_text_seq_df)
)
text_seq_features.shape

(7080, 50)

In [12]:
# Join the three feature families column-wise, in the same order for both
# splits: one-hot emoticons, PCA-projected features, digit sequences.
train_blocks = (encoded_X_train_flat, X_train_flattened_pca, text_seq_features)
val_blocks = (encoded_X_val_flat, X_val_flattened_pca, val_text_seq_features)

combined_features = np.concatenate(train_blocks, axis=1)
val_combined_features = np.concatenate(val_blocks, axis=1)
combined_features.shape

(7080, 9832)

In [18]:
# L2-penalised logistic regression on the combined feature matrix.
# NOTE(review): the iteration limit is left at the library default and warnings
# are silenced earlier in the notebook, so a non-converged fit would go
# unnoticed — verify convergence before relying on the saved model.
model = LogisticRegression(penalty='l2', C=100, solver='lbfgs')
model.fit(X=combined_features, y=labels)

In [19]:
# Score the fitted model on the validation split: overall accuracy first,
# then the per-class precision / recall / F1 table.
y_pred = model.predict(X=val_combined_features)
accuracy = accuracy_score(y_true=val_labels, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_true=val_labels, y_pred=y_pred))

Accuracy: 0.99
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       252
           1       0.99      0.98      0.99       237

    accuracy                           0.99       489
   macro avg       0.99      0.99      0.99       489
weighted avg       0.99      0.99      0.99       489



In [20]:
# Persist the fitted PCA transform and the trained classifier to the working
# directory.
# NOTE(review): the fitted one-hot encoder is not saved here; anything loading
# these two files must rebuild it from the training data — confirm that is intended.
joblib.dump(value=pca, filename='PCA_for_combined_model.pkl')
joblib.dump(value=model, filename='Trained_combined_model.pkl')

['Trained_combined_model.pkl']