## Basic Setup
Mount a Google Drive folder first, so that the input data and every generated output live in one location that all later cells share.

In [None]:
from google.colab import drive
from pathlib import Path
import os

# Mount Google Drive at /content/drive so that later cells can read from and
# write to it through the regular filesystem.
drive.mount('/content/drive')

# Define and create the output folder.
# `base_path` is the single directory used by the following cells for the
# input CSV, the generated feature files and the saved model.
base_path = Path('/content/drive/My Drive/ColabOutputs')
base_path.mkdir(parents=True, exist_ok=True)  # Create the folder (and any missing parents); no error if it already exists



In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from scipy.stats import entropy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer

### Load Data

In [None]:
# Read the cleaned training tweets from the shared output folder.
df = pd.read_csv(base_path / "train_cleaned_user.csv")

# Normalise the raw frame in one step:
#   row_id - positional identifier 0..len(df)-1 (appended as a new column)
#   Tweets - missing text replaced by the empty string
#   datex  - parsed into pandas datetimes
df = df.assign(
    row_id=np.arange(len(df)),
    Tweets=df['Tweets'].fillna(''),
    datex=pd.to_datetime(df['datex']),
)

### Generate Embeddings Using Sentence-BERT

In [None]:
# Sentence-BERT embedding: one vector per user.
# Every tweet of a user is joined (space-separated) into a single document.
user_grouped = df.groupby('User')['Tweets'].apply(' '.join).reset_index()
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all user documents in ONE call instead of one call per row: `encode`
# accepts a list of sentences and batches them internally, which avoids the
# per-call overhead of invoking the model once for every user.
# NOTE(review): the model presumably truncates inputs longer than its maximum
# sequence length, so very long concatenations may be only partly represented
# - confirm against the model card.
embedding_matrix = np.asarray(model.encode(user_grouped['Tweets'].tolist()))

# Keep the per-user vectors available as a column (one 1-D array per row),
# which is what the later save step drops before writing the CSV.
user_grouped['embedding'] = list(embedding_matrix)

### Functions for User Metrics

In [None]:
# Post count per active day
def active_days(dates):
    """Return the number of distinct calendar days present in *dates*.

    *dates* must be a datetime-typed pandas Series (it is accessed through
    ``.dt``). Missing timestamps are not filtered out before counting.
    """
    calendar_days = dates.dt.date
    return calendar_days.nunique(dropna=False)

# Per-user posting volume: timestamps, distinct active days, total posts and
# the resulting average number of posts per active day.
user_dates = df.groupby('User')['datex'].apply(list).reset_index()
user_dates['active_days'] = user_dates['datex'].apply(lambda dates: active_days(pd.Series(dates)))

# Look the post count up by the 'User' key instead of pasting the raw values
# of a second groupby positionally: the result no longer depends on both
# groupby outputs happening to share the same row order.
user_dates['post_count'] = user_dates['User'].map(df.groupby('User').size())

user_dates['avg_post_per_day'] = user_dates['post_count'] / user_dates['active_days']

# Attach only the derived ratio to the per-user feature table.
user_grouped = user_grouped.merge(user_dates[['User', 'avg_post_per_day']], on='User')

# Burstiness and average tweet length
def compute_burstiness(dates):
    """Return the spread of the waiting times between consecutive posts.

    The timestamps are sorted, the gap between each neighbouring pair is
    expressed in hours, and the standard deviation of those gaps (as computed
    by ``np.std``) is returned. With fewer than two timestamps there is no
    gap to measure and the result is 0.
    """
    if len(dates) < 2:
        return 0
    ordered = sorted(dates)
    # Pair every timestamp with its successor to get the inter-post gaps.
    gap_hours = [
        (later - earlier).total_seconds() / 3600
        for earlier, later in zip(ordered, ordered[1:])
    ]
    return np.std(gap_hours)

# Burstiness per user: spread of the gaps between that user's posts.
burstiness_df = (
    df.groupby('User')['datex']
    .apply(compute_burstiness)
    .reset_index(name='burstiness')
)

# Mean tweet length (in characters) per user.
avg_len_df = (
    df.groupby('User')['Tweets']
    .apply(lambda tweets: np.mean(tweets.str.len()))
    .reset_index(name='avg_tweet_length')
)

# Combine both behavioural features, then attach them to the user table.
behavior_df = pd.merge(burstiness_df, avg_len_df, on='User')
user_grouped = pd.merge(user_grouped, behavior_df, on='User')

In [None]:
# Save
# The embedding matrix is stored as a NumPy binary file.
np.save(base_path / 'user_embeddings.npy', embedding_matrix)

# The per-user feature table is written without its 'embedding' column.
tabular_features = user_grouped.drop(columns=['embedding'])
tabular_features.to_csv(base_path / 'user_features.csv', index=False)

# Behavioural features (burstiness, average tweet length) get their own file.
behavior_df.to_csv(base_path / 'user_behavior_features.csv', index=False)

In [None]:
import pandas as pd
import numpy as np

# Rebuild the per-user table from scratch: one row per user holding all of
# that user's tweets joined with single spaces (no embedding column here).
user_grouped = df.groupby('User')['Tweets'].apply(' '.join).reset_index()

# avg_post_per_day
def active_days(dates):
    """Return the number of distinct calendar days present in *dates*.

    *dates* must be a datetime-typed pandas Series (it is accessed through
    ``.dt``). Missing timestamps are not filtered out before counting.
    """
    calendar_days = dates.dt.date
    return calendar_days.nunique(dropna=False)
# Per-user posting volume: timestamps, distinct active days, total posts and
# the resulting average number of posts per active day.
user_dates = df.groupby('User')['datex'].apply(list).reset_index()
user_dates['active_days'] = user_dates['datex'].apply(lambda d: active_days(pd.Series(d)))

# Look the post count up by the 'User' key instead of pasting the raw values
# of a second groupby positionally: the result no longer depends on both
# groupby outputs happening to share the same row order.
user_dates['post_count'] = user_dates['User'].map(df.groupby('User').size())

user_dates['avg_post_per_day'] = user_dates['post_count'] / user_dates['active_days']

# Attach only the derived ratio to the per-user feature table.
user_grouped = user_grouped.merge(user_dates[['User', 'avg_post_per_day']], on='User')

# Burstiness & avg_len
def compute_burstiness(dates):
    """Return the spread of the waiting times between consecutive posts.

    The timestamps are sorted, the gap between each neighbouring pair is
    expressed in hours, and the standard deviation of those gaps (as computed
    by ``np.std``) is returned. With fewer than two timestamps there is no
    gap to measure and the result is 0.
    """
    if len(dates) < 2:
        return 0
    ordered = sorted(dates)
    # Pair every timestamp with its successor to get the inter-post gaps.
    gap_hours = [
        (later - earlier).total_seconds() / 3600
        for earlier, later in zip(ordered, ordered[1:])
    ]
    return np.std(gap_hours)
# Burstiness per user: spread of the gaps between that user's posts.
burstiness_df = (
    df.groupby('User')['datex']
    .apply(compute_burstiness)
    .reset_index(name='burstiness')
)

# Mean tweet length (in characters) per user.
avg_len_df = (
    df.groupby('User')['Tweets']
    .apply(lambda tweets: np.mean(tweets.str.len()))
    .reset_index(name='avg_tweet_length')
)

# Combine both behavioural features, then attach them to the user table.
behavior_df = pd.merge(burstiness_df, avg_len_df, on='User')
user_grouped = pd.merge(user_grouped, behavior_df, on='User')

# One label per user: rows with a missing user or flag are discarded, exact
# duplicates are collapsed, and the largest remaining `io_flag` value of each
# user is kept.
labels = (
    df[['User', 'io_flag']]
    .dropna()
    .drop_duplicates()
    .groupby('User')['io_flag']
    .max()
    .reset_index()
)

# Join the labels onto the feature table by 'User'.
user_grouped = pd.merge(user_grouped, labels, on='User')




In [None]:
# Persist the labelled per-user feature table; the training cell reads this file back.
labeled_features_path = base_path / "user_features_labeled.csv"
user_grouped.to_csv(labeled_features_path, index=False)

### Training the Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Behavioural features used as model inputs.
feature_columns = ['avg_post_per_day', 'burstiness', 'avg_tweet_length']

# Load the labelled per-user table written by the feature-building cell.
train = pd.read_csv(base_path / "user_features_labeled.csv")
X_train = train[feature_columns]
y_train = train['io_flag']

# Random forest with a fixed seed; class 1 is weighted 20x relative to class 0.
model = RandomForestClassifier(
    random_state=42,
    class_weight={0: 1, 1: 20},
)
model.fit(X_train, y_train)


In [None]:
# NOTE(review): `test` and `true_labels` are not defined in any visible cell,
# and nothing shown here computes the `io_prob` column that the metrics cell
# reads - presumably they come from a cell or file that is not part of this
# notebook export; confirm before running, otherwise this cell raises NameError.

# Remove every column whose name contains 'io_flag' from `test`, so the
# merge below does not produce clashing label columns.
test = test.drop(columns=[col for col in test.columns if 'io_flag' in col])
# Join the remaining columns with the reference labels on 'User'.
eval_df = test.merge(true_labels, on='User')
# Show the resulting column names as a quick sanity check.
print(eval_df.columns)

In [None]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

# Reference labels and predicted probabilities from the evaluation frame.
y_true = eval_df['io_flag']
y_score = eval_df['io_prob']

# Ranking quality, computed directly on the probabilities.
auc = roc_auc_score(y_true, y_score)

# Hard predictions: positive when the probability is at least 0.7.
eval_df['pred_label'] = (y_score >= 0.7).astype(int)
y_pred = eval_df['pred_label']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"AUC: {auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

### Save the Model

In [None]:
import joblib
from pathlib import Path

# File name of the saved model inside the shared output folder (`base_path`
# itself is created in the setup cell).
model_name = "user_model_v1"

# Full save path
save_path = base_path / model_name

# Save the trained model, then load it back from disk so that `model` now
# refers to the deserialised copy (a quick round-trip check of the saved file).
joblib.dump(model, save_path)
model = joblib.load(save_path)

# Only the model is written here - there is no tokenizer in this pipeline - so
# the message reports exactly what was saved.
print(f"✅ Model saved to: {save_path}")
