In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import pickle

In [None]:
# Read the cleaned training CSV into a DataFrame
train_csv_path = 'data/train_clean.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Pull the 'class' and 'comment' columns out of the frame as plain Python lists
labels = df['class'].to_list()
comments = df['comment'].to_list()

In [None]:
# Read the saved embeddings array from disk
embeddings_path = 'model/embeddings.npy'
embeddings = np.load(embeddings_path)

In [None]:
# Keep the first axis (one entry per sample) and flatten every remaining axis
# into a single feature dimension, giving a 2-D (samples, features) array.
flat_shape = (embeddings.shape[0], -1)
reshaped_embeddings = embeddings.reshape(flat_shape)

In [None]:
# Hold out 20% of the samples for testing; stratifying on the labels keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split reproducible.
split_parts = train_test_split(
    reshaped_embeddings,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

Note: here we used the `PassiveAggressiveClassifier` because the embeddings are too big, and the only solution we have is to use incremental learning.

That's why we divided the data into batches and used `partial_fit` for training.

In [None]:
# Incrementally train a Passive Aggressive Classifier in small batches.
#
# Only the training split (X_train / y_train) is fed to the model. Fitting on
# the full `reshaped_embeddings` / `labels` would also train on the rows that
# ended up in X_test, leaking the test set and inflating the reported accuracy.
batch_size = 10
num_embeddings = reshaped_embeddings.shape[0]  # total sample count (train + test)
num_train_samples = X_train.shape[0]
# Ceiling division so a final, shorter batch is still processed
num_batches = (num_train_samples + batch_size - 1) // batch_size

# Train a Passive Aggressive Classifier
pac = PassiveAggressiveClassifier(random_state=42)
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_train_samples)
    # partial_fit must be told the complete label set because a single batch
    # may not contain every class; [0, 1] assumes binary 0/1 labels.
    pac.partial_fit(X_train[start_idx:end_idx], y_train[start_idx:end_idx], classes=[0, 1])

In [None]:
# Run the fitted classifier on the test split
y_pred = pac.predict(X_test)

# Fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
# Save the trained model to disk.
# The context manager closes (and therefore flushes) the file even if
# pickling raises, instead of leaving the handle to the garbage collector.
with open("model/pac.sav", 'wb') as model_file:
    pickle.dump(pac, model_file)