In [None]:
import pandas as pd
from ydata_profiling import ProfileReport

In [None]:
# Load the training messages from CSV (path is relative to the notebook's working directory).
data = pd.read_csv('./data/messages_source_url_merged_lemma_train.csv')
# Display the row labelled 1 as a quick look at the available columns and typical values.
data.loc[1]

In [None]:
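# Optional exploratory step (left disabled): build a ydata-profiling report of `data`.
# Uncomment both lines to generate and display it.
# profile = ProfileReport(data, title="Profiling Report")
# profile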

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy

# Transformers fitted by the first (training) call to prepare_X and reused by
# later calls, so that every feature matrix shares one TF-IDF vocabulary and
# one scaling. Re-running this cell resets the state and forces a refit.
_PREPARE_X_STATE = {}


def prepare_X(prevData, fit=None):
    """Build the dense feature matrix used by the classifiers.

    The matrix is the horizontal concatenation of a TF-IDF encoding of the
    'text_lemmatized' column (at most 1000 terms) and the standardized
    'impressions', 'reactions', 'shares' and 'comments' columns.

    Parameters
    ----------
    prevData : pandas.DataFrame
        Frame holding the five columns named above. Missing text is treated
        as an empty string and missing counts as 0. The frame is not modified.
    fit : bool or None, default None
        True  -- fit a fresh vectorizer and scaler on ``prevData`` and
                 remember them for later calls.
        False -- only transform, using the remembered vectorizer and scaler.
        None  -- fit on the first call, transform on every later call. This
                 keeps ``prepare_X(train)`` followed by ``prepare_X(test)``
                 in the same feature space.

    Returns
    -------
    numpy.ndarray
        Array with one row per row of ``prevData``; TF-IDF columns first,
        then the four scaled numeric columns.

    Raises
    ------
    RuntimeError
        If ``fit=False`` is requested before any call has fitted the
        transformers.
    """
    if fit is None:
        # First call fits (training data); later calls reuse what was fitted.
        fit = not _PREPARE_X_STATE
    elif not fit and not _PREPARE_X_STATE:
        raise RuntimeError(
            "prepare_X(fit=False) called before the transformers were fitted; "
            "call prepare_X on the training data first."
        )

    # Non-mutating fill: a chained `df[col].fillna(..., inplace=True)` is
    # deprecated and is a silent no-op under pandas Copy-on-Write, which would
    # leave NaN in the text and make the vectorizer raise.
    text = prevData['text_lemmatized'].fillna('')
    additional_features = prevData[['impressions', 'reactions', 'shares', 'comments']].fillna(0)

    if fit:
        tfidf_vectorizer = TfidfVectorizer(max_features=1000)
        scaler = StandardScaler()
        X_Text = tfidf_vectorizer.fit_transform(text).toarray()
        X_additional = scaler.fit_transform(additional_features)
        _PREPARE_X_STATE['tfidf_vectorizer'] = tfidf_vectorizer
        _PREPARE_X_STATE['scaler'] = scaler
    else:
        # Reuse the training vocabulary and scaling so columns line up with
        # the matrix the models were trained on.
        X_Text = _PREPARE_X_STATE['tfidf_vectorizer'].transform(text).toarray()
        X_additional = _PREPARE_X_STATE['scaler'].transform(additional_features)

    return numpy.hstack([X_Text, X_additional])

# Encoder mapping category strings <-> integer codes; later cells call
# label_encoder.inverse_transform to turn predictions back into names.
label_encoder = LabelEncoder()

# Feature matrix for the full training file.
X = prepare_X(data)
# Adds a new integer-coded target column to `data` (fits the encoder on all categories present).
data['source_category_encoded'] = label_encoder.fit_transform(data['source_category'])

# Target vector: a Series that keeps data's index, so it stays aligned with data.index in the splits.
y = data['source_category_encoded']


In [None]:
# data.index is split alongside X and y so predictions can be written back to
# the matching rows of `data` later (see the idx_* variables).
# With test_size=0.99 the first outputs (X_over, ...) receive 1% of the rows and
# the second outputs (X_10, ...) receive 99%.
# NOTE(review): the `_10` suffix suggests a 10% subsample was intended at some point — confirm.
X_over, X_10, Y_over, Y_10, idx_over, idx_10 = train_test_split(X, y, data.index,test_size=0.99, random_state=42)
# 70% of the retained rows for training, 30% held back.
X_train, X_temp, y_train, y_temp,idx_train, idx_temp = train_test_split(X_10, Y_10, idx_10, test_size=0.3, random_state=42)
# The held-back 30% is halved into validation and test parts.
X_val, X_test, y_val, y_test,idx_val, idx_test = train_test_split(X_temp, y_temp, idx_temp, test_size=0.5, random_state=42)


In [None]:
# Number of rows kept after the first split (the pool that is divided into train/val/test).
len(Y_10)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Train Random Forest (default hyperparameters; random_state fixed for reproducibility)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Train K-Nearest Neighbors (default hyperparameters) on the same training split
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report, f1_score
import matplotlib.pyplot as plt

# Predict the validation split with both fitted models.
rf_preds = rf_model.predict(X_val)
knn_preds = knn_model.predict(X_val)

# Class labels as strings, used as row names in the reports.
target_names = list(map(str, label_encoder.classes_))

# One entry per model: (report heading, F1 line label, predictions).
evaluations = (
    ("Random Forest Classification Report", "Random Forest F1 Score:", rf_preds),
    ("\nK-Nearest Neighbors Classification Report", "K-Nearest Neighbors F1 Score:", knn_preds),
)

# Print the per-class report followed by the weighted F1 for each model.
for heading, f1_label, preds in evaluations:
    print(heading)
    print(classification_report(y_val, preds, target_names=target_names))
    print(f1_label, f1_score(y_val, preds, average='weighted'))


In [None]:
# Map the random forest's integer predictions for the validation split back to category names.
predicted_category_names = label_encoder.inverse_transform(rf_preds)

# Write them onto the validation rows only; idx_val holds those rows' labels in `data`.
# Rows outside the validation split get NaN in the new 'predicted_category' column.
data.loc[idx_val, 'predicted_category'] = predicted_category_names


In [None]:
# Export only the validation rows (all columns of `data`); index=False omits the row labels.
data.loc[idx_val].to_csv('./data/validation_messages_with_predicted_category.csv', index=False)

In [None]:
# Load the separate test file.
test_data = pd.read_csv('./data/messages_source_url_merged_lemma_test.csv')


# Build its feature matrix and predict with the trained random forest.
# NOTE(review): rf_model was trained on features produced from the training file,
# so this matrix must use the same TF-IDF vocabulary and scaling — confirm that
# prepare_X reuses the fitted transformers rather than refitting on test_data.
X_test_data = prepare_X(test_data)
rf_preds_test = rf_model.predict(X_test_data)

# Integer predictions -> category names via the encoder fitted on the training labels.
predicted_category_names_test = label_encoder.inverse_transform(rf_preds_test)

# One prediction per test row, stored as a new column.
test_data['predicted_category'] = predicted_category_names_test


In [None]:
# Sanity check: number of predictions produced for the test file.
len(predicted_category_names_test)

In [None]:
# Display the row labelled 1 to inspect a test record together with its 'predicted_category'.
test_data.loc[1]

In [None]:
# Save the test-set predictions under their own file name. Writing them to
# './data/validation_messages_with_predicted_category.csv' would silently
# overwrite the validation export that uses that path.
test_data.to_csv('./data/test_messages_with_predicted_category.csv', index=False)