# Importing data

In [None]:
import pandas as pd
import re
import nltk
!pip install contractions
import contractions
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from scipy.sparse import hstack
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt



[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Mount Google Drive (Colab only) so the CSV files below are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Read both splits and rename the 'Text' column to lowercase 'text', which is
# the column name the following cells operate on.
df_train = pd.read_csv(
    "/content/drive/MyDrive/mis_assessment/train3.csv"
).rename(columns={'Text': 'text'})
df_test = pd.read_csv(
    "/content/drive/MyDrive/mis_assessment/test3.csv"
).rename(columns={'Text': 'text'})

# Plain-text preview of the first training rows.
print(df_train.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                                                text  category
0  when modi promised “minimum government maximum...      -1.0
1  talk all the nonsense and continue all the dra...       0.0
2  what did just say vote for modi  welcome bjp t...       1.0
3  asking his supporters prefix chowkidar their n...       1.0
4  answer who among these the most powerful world...       1.0


# Preprocessing and EDA

## NaN Values and Duplicate Values Removal

In [None]:
# Report the state before cleaning: total number of NaN cells and number of
# rows that are exact duplicates of an earlier row (all columns compared).
print(f"Nan Values before removal {df_train.isna().sum().sum()}, Duplicate Values before removal {df_train.duplicated().sum()}")

# Drop every row containing a NaN, then keep only the first occurrence of each
# distinct text and renumber the index from 0.
df_train = df_train.dropna()
df_train = df_train.drop_duplicates(subset='text').reset_index(drop=True)

# Report the state after cleaning. Duplicates are counted on 'text' here
# because that is the column the de-duplication above keys on.
# (The label previously read "before removal" and the message ended with a
# stray ')'.)
print(f"Nan Values after removal {df_train.isna().sum().sum()}, Duplicate Values after removal {df_train['text'].duplicated().sum()}")

Nan Values before removal 17397, Duplicate Values before removal 8696
Nan Values before removal 0, Duplicate Values after removal 0)


In [None]:
# Class balance: number of rows for each value of 'category', largest first.
df_train.value_counts(subset='category')

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
1.0,68228
0.0,52317
-1.0,33732


## URL Removal

In [None]:
# Running total of URL-like tokens dropped by get_words_from_urls below.
url_count = 0

# With gaps=True the pattern describes the separators, so this tokenizer
# splits a string on '/' rather than returning '/' as a token.
regex_tokenizer = RegexpTokenizer(pattern='/', gaps=True)
def join_and_split(words):
    """Join the pieces in ``words`` with single spaces and return the
    whitespace-separated tokens of the resulting string as a list."""
    merged = " ".join(words)
    return merged.split()

# Break every text on '/' and re-tokenise the pieces on whitespace, so each
# row becomes a list of tokens in which the parts of a URL stand on their own.
df_train['text'] = (
    df_train['text']
    .apply(regex_tokenizer.tokenize)
    .apply(join_and_split)
)
# The same two steps for the test set are currently disabled:
# df_test['text'] = df_test['text'].apply(regex_tokenizer.tokenize)
# df_test['text'] = df_test['text'].apply(join_and_split)

def get_words_from_urls(words):
    """Rebuild a sentence from ``words`` while dropping URL-like tokens.

    A token is dropped when it starts with 'http' or 'www'; each dropped
    token increments the module-level ``url_count``.

    Args:
        words: iterable of string tokens.

    Returns:
        The kept tokens concatenated in order, each followed by one space
        (so a non-empty result always ends with a space).
    """
    global url_count
    kept = []
    for token in words:
        if token.startswith(('http', 'www')):
            url_count += 1
        else:
            kept.append(token)
    return "".join(f"{token} " for token in kept)

# Rebuild each row's text from its token list, minus the URL-like tokens.
df_train['text'] = df_train['text'].map(get_words_from_urls)
# Test-set equivalent (disabled):
# df_test['text'] = df_test['text'].apply(get_words_from_urls)

# Number of URL-like tokens that were dropped.
print(url_count)

31


## Expanding contractions

In [None]:
# Run every training text through the `contractions` package's fix().
df_train['text'] = df_train['text'].map(contractions.fix)
# Test-set equivalent (disabled):
# df_test['text'] = df_test['text'].apply(contractions.fix)

## Lemmatization

In [None]:
# Fetch the 'punkt_tab' NLTK resource; presumably required by the
# word_tokenize calls further down in this cell.
nltk.download('punkt_tab')

# Single WordNet lemmatizer instance, shared by lemmatize_sentence.
lemmatizer = WordNetLemmatizer()

# Case is deliberately left untouched at this stage, since upper- and
# lowercase spellings of a word might carry different connotations.
def lemmatize_sentence(tokenized_list):
    """Lemmatize the longer tokens of a sentence and join it back into text.

    Tokens of more than four characters are passed through the module-level
    ``lemmatizer``; shorter tokens are kept verbatim.

    Args:
        tokenized_list: iterable of tokens making up one sentence.

    Returns:
        The processed tokens joined by single spaces.
    """
    pieces = []
    for token in tokenized_list:
        if len(token) > 4:
            token = lemmatizer.lemmatize(token)
        pieces.append(str(token))
    return ' '.join(pieces)

# Tokenise each text with NLTK, then fold the tokens straight back into a
# lemmatized, space-separated string.
df_train['text'] = (
    df_train['text']
    .apply(word_tokenize)
    .apply(lemmatize_sentence)
)

# Test-set equivalent (disabled):
# df_test['text'] = df_test['text'].apply(word_tokenize)
# df_test['text'] = df_test['text'].apply(lemmatize_sentence)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Removing Punctuation

In [None]:
# Keep only ASCII letters, spaces, '?' and '!'; every other character
# (digits, other punctuation, non-ASCII letters, tabs/newlines) is deleted.
df_train['text'] = df_train['text'].str.replace(r'[^a-zA-Z ?!]+', '', regex=True)

# Collapse whitespace runs to a single space and only then strip the ends.
# Stripping has to come last: deleting a leading or trailing punctuation token
# (e.g. "great ." -> "great ") leaves a dangling space, which would make
# otherwise identical texts compare as different and would add empty entries
# to a split(" ")-based word count.
df_train['text'] = (
    df_train['text']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
# Test-set equivalent (disabled):
# df_test['text'] = df_test['text'].str.replace(r'[^a-zA-Z ?!]+', '', regex=True)
# df_test['text'] = df_test['text'].str.replace(r'\s+', ' ', regex=True).str.strip()

## Removing Duplicates

In [None]:
# Cleaning can make previously distinct texts identical, so de-duplicate on
# 'text' once more and renumber the index from 0.
df_train = df_train.drop_duplicates(subset='text', ignore_index=True)

## Vectorization

In [None]:
# Cleaned training texts that will be vectorised.
train_comments = df_train["text"]
# test_comments = df_test["text"]

# Corpus the vectorizers are fitted on. Only the training texts are included
# while the test set stays disabled.
all_comments = pd.concat([train_comments], axis=0)

all_comments.shape  # not rendered: a cell only displays its final expression
all_comments.head()

Unnamed: 0,text
0,when modi promised minimum government maximum ...
1,talk all the nonsense and continue all the dra...
2,what did just say vote for modi welcome bjp to...
3,asking his supporter prefix chowkidar their na...
4,answer who among these the most powerful world...


In [None]:
# Total TF-IDF feature budget; 3/5 of it goes to word n-grams and 2/5 to
# character n-grams (see the two max_features expressions below).
mf = 30000

# Word-level TF-IDF over 1- to 3-grams with English stop words removed.
# lowercase=True folds case here, even though the earlier cleaning steps
# preserved it.
# NOTE(review): in token_pattern the trailing `\w{,1}` follows a `\b` that
# comes right after word characters, so it appears it can only match the empty
# string, making the pattern equivalent to `(?u)\b\w\w+\b` -- confirm.
vectorizer = TfidfVectorizer(sublinear_tf = True,
                             strip_accents = 'unicode',
                             analyzer = 'word',
                             token_pattern = '(?u)\\b\\w\\w+\\b\\w{,1}',
                             lowercase = True,
                             stop_words = 'english',
                             ngram_range = (1, 3),
                             min_df = 2,
                             max_df = 0.6,
                             norm = 'l2',
                             max_features = int((3/5)*mf)
                             )
# NOTE(review): this vectorizer and the character one below are fitted on
# every row of df_train['text'] before the train/validation split done in a
# later cell, so vocabulary and IDF weights have already seen the validation
# rows. Consider fitting on the training portion only.
vectorizer.fit(all_comments)
train_word_features = vectorizer.transform(train_comments)
# test_word_features = vectorizer.transform(test_comments)

# Character-level TF-IDF over 2- to 6-character n-grams.
char_vectorizer = TfidfVectorizer (sublinear_tf = True,
                                   strip_accents = 'unicode',
                                   analyzer = 'char',
                                   ngram_range = (2, 6),
                                   min_df = 2,
                                   max_df = 0.6,
                                   max_features = int((2/5)*mf)
                                   )
char_vectorizer.fit(all_comments)
train_char_features = char_vectorizer.transform(train_comments)
# test_char_features = char_vectorizer.transform(test_comments)

# Place word and character features side by side in one sparse matrix.
train_features = hstack([train_word_features, train_char_features])
# test_features = hstack([test_word_features, test_char_features])

## Feature Engineering and One Hot Encoding

In [None]:
# Number of whitespace-separated words in each text. A bare split() is used
# rather than split(" "): the latter also returns the empty strings produced
# by a leading or trailing space and yields 1 for an empty text, which
# overstated the count.
df_train['word_count'] = df_train['text'].apply(lambda text: len(str(text).split()))

# Train Test Split

In [None]:
from scipy.sparse import csr_matrix, hstack

# Shape the word counts as an (n_rows, 1) sparse column so they can be
# appended to the TF-IDF matrix.
word_count_array = df_train['word_count'].to_numpy().reshape(-1, 1)
word_count_sparse = csr_matrix(word_count_array)

# Feature matrix: TF-IDF features followed by the word-count column.
X = hstack((train_features, word_count_sparse))
# X = train_features

# Target labels.
y = df_train['category']

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)
X_train

<121124x30001 sparse matrix of type '<class 'numpy.float64'>'
	with 40775819 stored elements in Compressed Sparse Row format>

# Training

## Ridge Classifier

In [None]:
from sklearn.linear_model import RidgeClassifier

# Baseline: ridge classifier with default settings, fitted on the training
# split.
clf = RidgeClassifier()
clf.fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on and on the held-out split.
print(f"Training Accuracy : {clf.score(X_train, y_train)}")
print(f"Validation Accuracy : {clf.score(X_test, y_test)}")

Training Accuracy : 0.9428271853637594
Validation Accuracy : 0.8769525444998514


In [None]:
# from sklearn.naive_bayes import GaussianNB
# clf = GaussianNB()
# y_pred = clf.fit(X_train.toarray(), y_train)

# print(f"Training Accuracy : {clf.score(X_train.toarray(), y_train)}")
# print(f"Validation Accuracy : {clf.score(X_test.toarray(), y_test)}")

In [None]:
# Relabel class -1 as 2, in place, in both splits. Presumably needed because
# the XGBoost model below is configured with num_class=3 and expects the
# labels 0, 1, 2 -- confirm.
y_train.loc[y_train == -1] = 2
y_test.loc[y_test == -1] = 2

In [None]:
import xgboost as xgb
# NOTE(review): train_test_split is re-imported here but not used in this cell.
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Hyperparameters for a 3-class soft-probability XGBoost model.
# The numeric values presumably come from the Optuna study kept (commented
# out) in the "Bayes Optimisation" section -- confirm.
# "device": "cuda" requests GPU training, so this cell presumably needs a
# CUDA-capable runtime.
params = {
    "objective": "multi:softprob",
    "num_class": 3,
    "tree_method": "hist",
    "device": "cuda",
    'eta': 0.24172217809311938,
    'max_depth': 13,
    'min_child_weight': 1.204125740865512,
    'subsample': 0.9658273955957696,
    'colsample_bytree': 0.7733759101125599,
    'gamma': 1.788039740353529,
    'lambda': 1.2227284430176681,
    'alpha': 0.1801118342336058,
    'n_estimators': 453,
    'colsample_bylevel': 0.7108990821059266,
    'max_delta_step': 1,
    'grow_policy': 'lossguide'
}

# Fit on the training split (labels already remapped to 0/1/2 in the
# previous cell).
model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train, verbose=False)

# Class probabilities on the held-out split and their one-vs-rest ROC-AUC.
# NOTE(review): `auc` is assigned but never displayed in this cell; the
# evaluation cell that follows computes the same metric again.
preds = model.predict_proba(X_test)
auc = roc_auc_score(y_test, preds, multi_class="ovr")

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    log_loss,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Hard label predictions and per-class probabilities for the held-out split.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

# Scalar metrics, listed in the order in which they are printed.
metrics = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision (Macro)": precision_score(y_test, y_pred, average="macro"),
    "Precision (Weighted)": precision_score(y_test, y_pred, average="weighted"),
    "Recall (Macro)": recall_score(y_test, y_pred, average="macro"),
    "Recall (Weighted)": recall_score(y_test, y_pred, average="weighted"),
    "F1 Score (Macro)": f1_score(y_test, y_pred, average="macro"),
    "F1 Score (Weighted)": f1_score(y_test, y_pred, average="weighted"),
    "ROC-AUC (OVR)": roc_auc_score(y_test, y_pred_proba, multi_class="ovr"),
    "ROC-AUC (OVO)": roc_auc_score(y_test, y_pred_proba, multi_class="ovo"),
    "Log Loss": log_loss(y_test, y_pred_proba),
    "Matthews Correlation Coefficient": matthews_corrcoef(y_test, y_pred),
}

print("Metrics Summary:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

# Per-class precision / recall / F1 table.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Counts of true label (rows) versus predicted label (columns).
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


## Bayes Optimisation

In [None]:
# !pip install joblib
# import joblib
# !pip install optuna
# import optuna
# import xgboost as xgb
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import roc_auc_score

# def objective(trial):
#     # Define hyperparameter search space
#     params = {
#         "objective": "multi:softprob",
#         "num_class": 3,
#         "tree_method": "hist",
#         "device": "cuda",
#         "learning_rate": trial.suggest_float("eta", 0.2, 0.5),
#         "max_depth": trial.suggest_int("max_depth", 3, 15),
#         "min_child_weight": trial.suggest_float("min_child_weight", 1, 10),
#         "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
#         "gamma": trial.suggest_float("gamma", 0, 5),
#         "lambda": trial.suggest_float("lambda", 1, 5),
#         "alpha": trial.suggest_float("alpha", 0, 5),
#         "n_estimators": trial.suggest_int("n_estimators", 50, 500),

#         "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
#         "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
#         "max_delta_step": trial.suggest_int("max_delta_step", 0, 10),
#         "sampling_method": trial.suggest_categorical("sampling_method", ["uniform", "gradient_based"]),
#         "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
#         }

#     # Train model
#     model = xgb.XGBClassifier(**params)
#     model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

#     # Evaluate model
#     preds = model.predict_proba(X_test)
#     auc = roc_auc_score(y_test, preds, multi_class="ovr")
#     return auc

# # study = optuna.create_study(direction="maximize")
# study = joblib.load(f"/content/drive/MyDrive/mis_assessment/xgb_optuna_study_batch_auc.pkl")
# def save_study_callback(study, trial):
#     joblib.dump(study, f"/content/drive/MyDrive/mis_assessment/xgb_optuna_study_batch_auc.pkl")
#     print("saved study")
# study.optimize(objective, n_trials=80, callbacks=[save_study_callback])

# print("Best hyperparameters:", study.best_params)
