In [1]:
import pandas as pd
from ydata_profiling import ProfileReport

In [2]:
data = pd.read_csv('./data/messages_with_source_url_train.csv')

In [3]:
# profile = ProfileReport(data, title="Profiling Report")
# profile

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

def prepare_X(prevData):
    prevData['text_cleared'].fillna('', inplace=True)

    tfidf_vectorizer = TfidfVectorizer(max_features=250)
    X = tfidf_vectorizer.fit_transform(prevData['text_cleared']).toarray()

    return X

label_encoder = LabelEncoder()

X = prepare_X(data)
data['source_category_encoded'] = label_encoder.fit_transform(data['source_category'])

y = data['source_category_encoded']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  prevData['text_cleared'].fillna('', inplace=True)


In [5]:
X_over, X_10, Y_over, Y_10, idx_over, idx_10 = train_test_split(X, y, data.index,test_size=0.99, random_state=42)
X_train, X_temp, y_train, y_temp,idx_train, idx_temp = train_test_split(X_10, Y_10, idx_10, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test,idx_val, idx_test = train_test_split(X_temp, y_temp, idx_temp, test_size=0.5, random_state=42)


In [6]:
len(Y_10)

260720

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Train Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Train K-Nearest Neighbors
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)


In [8]:
from sklearn.metrics import classification_report, f1_score
import matplotlib.pyplot as plt

# Predictions
rf_preds = rf_model.predict(X_val)
knn_preds = knn_model.predict(X_val)

target_names = [str(cls) for cls in label_encoder.classes_]

# Evaluation
print("Random Forest Classification Report")
print(classification_report(y_val, rf_preds, target_names=target_names))
print("Random Forest F1 Score:", f1_score(y_val, rf_preds, average='weighted'))

print("\nK-Nearest Neighbors Classification Report")
print(classification_report(y_val, knn_preds, target_names=target_names))
print("K-Nearest Neighbors F1 Score:", f1_score(y_val, knn_preds, average='weighted'))


Random Forest Classification Report
                                      precision    recall  f1-score   support

              AGGRESSIVE_INFORMATION       0.47      0.81      0.60     17926
                 AGGRESSIVE_MILITARY       0.16      0.02      0.04      1314
             COORDINATION_OF_ATTACKS       0.14      0.02      0.04       287
           CYBER_ATTACK_COORDINATION       0.72      0.17      0.27       108
ENTITIES_PROMOTING_VIOLENCE_AND_HATE       0.15      0.01      0.02       962
                PERSONAL_INFORMATION       0.12      0.02      0.04       226
              RESTRAINED_INFORMATION       0.43      0.22      0.30     13686
                 RESTRAINED_MILITARY       0.30      0.05      0.08      1406
                        SAFE_CONTENT       0.60      0.32      0.41      1288
                                SPAM       0.20      0.03      0.05       639
              UNRECOGNIZED_REPUBLICS       0.35      0.06      0.11       863
                           

In [9]:
predicted_category_names = label_encoder.inverse_transform(rf_preds)

data.loc[idx_val, 'predicted_category'] = predicted_category_names


In [10]:
data.loc[idx_val].to_csv('./data/validation_messages_with_predicted_category.csv', index=False)

In [11]:
test_data = pd.read_csv('./data/messages_with_source_url_test.csv')


X_test = prepare_X(test_data)
rf_preds_test = rf_model.predict(X_test)

predicted_category_names_test = label_encoder.inverse_transform(rf_preds_test)

test_data['predicted_category'] = predicted_category_names_test


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  prevData['text_cleared'].fillna('', inplace=True)


In [12]:
test_data.loc[1]

source_id                                                        253122
message_id                                                   1408345528
text                  Плохо спится в белую петербургскую ночь. Котор...
impressions                                                      1429.0
reactions                                                           164
shares                                                              3.0
comments                                                              1
published_at                                        2024-06-24 21:32:54
content_type                                                       POST
source_url                                       https://t.me/dtravin61
source_category                                                     NaN
text_cleared          плохо спится белую петербургскую ночь вечер си...
social_network                                                 TELEGRAM
predicted_category                               AGGRESSIVE_INFO

In [13]:
test_data.to_csv('./data/validation_messages_with_predicted_category.csv', index=False)