In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the comments table from a CSV file located in the working directory.
data = pd.read_csv('comments.csv')

In [None]:
# Show the first row to get a look at the columns that were loaded.
data.head(1)

-EDA and data cleaning :

In [None]:
# Remove both identifier columns in a single call; neither is referenced again below.
data.drop(columns=['CommentId', 'VideoId'], inplace=True)


In [None]:

# Summarise the remaining columns (dtypes and non-null counts) after the drop.
data.info()

In [None]:
# Show the first row of the current frame.
data.head(1)

-convert text to str format :

In [None]:
# Remove rows with missing text *before* casting: astype(str) would otherwise
# turn NaN into the literal string 'nan', which no later dropna() can detect.
data.dropna(subset=['Text'], inplace=True)
data['Text'] = data['Text'].astype(str)

In [None]:
# Re-check the column summary after the Text column was cast to str.
data.info()

-Preprocessing :

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: the stop-word lists and WordNet.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Built once at module level and reused by every preprocess_text call.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    """Normalise one raw comment into a space-separated string of lemmas.

    Steps, in order: lowercase; strip URLs, @mentions, #hashtags and HTML
    tags; delete every character that is not a-z or whitespace; split on
    whitespace; discard tokens found in ``stop_words``; lemmatize the
    remaining tokens with ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    text = text.lower()
    # The dot after "www" is escaped so only a literal "www." prefix matches;
    # unescaped, it also deleted ordinary tokens such as "awwwww".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    # Tags become a space rather than '' so "hello<br>world" does not collapse
    # into the single token "helloworld".
    text = re.sub(r'<.*?>', ' ', text)
    # Punctuation is deleted (not spaced) so contractions stay one token.
    # NOTE(review): this happens before the stop-word check, so a contraction
    # like "don't" is compared as "dont" — confirm that is intended.
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    processed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(processed_words)


# Clean every comment in place.
data['Text'] = data['Text'].apply(preprocess_text)


In [None]:

# List the distinct values of the Text column.
data['Text'].unique()

In [None]:
# Show the first row of the current frame.
data.head(1)


In [None]:
# Per-category label columns that are collapsed into one binary target.
toxicity_columns = [
    'IsToxic', 'IsAbusive', 'IsThreat', 'IsProvocative', 'IsObscene',
    'IsHatespeech', 'IsRacist', 'IsNationalist', 'IsSexist',
    'IsHomophobic', 'IsReligiousHate', 'IsRadicalism'
]

# Fail early with a clear message when a label column is absent. Silently
# skipping the target would only postpone the failure to the drop below
# (or to the first use of 'is_negative').
missing_columns = [col for col in toxicity_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"Missing toxicity label columns: {missing_columns}")

# A comment is negative (1) when at least one category flag is set, else 0.
data['is_negative'] = data[toxicity_columns].any(axis=1).astype(int)
data.drop(columns=toxicity_columns, inplace=True)


In [None]:
# Summarise the frame (dtypes and non-null counts) at this point.
data.info()

In [None]:
# Drop rows without text first, then show the class balance of what is left.
# The count must be the cell's last expression, otherwise the notebook
# discards it without displaying anything.
data.dropna(subset=['Text'], inplace=True)
data['is_negative'].value_counts()


-train test split :

In [None]:
from sklearn.model_selection import train_test_split

X = data['Text']
y = data['is_negative']

# 70/30 hold-out split with a fixed seed. stratify=y keeps the share of
# negative comments the same in both partitions, so the positive-class F1
# measured on the test set is not skewed by an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features, capped at 5000 terms.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# The vocabulary is learned from the training split only; the test split is
# merely projected onto it.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)


-applying logistic regression :

In [None]:
from sklearn.linear_model import LogisticRegression

-hyperparameter tuning using GridSearchCV :

In [None]:
# Inverse regularisation strengths to try, in ascending order.
c_values = [0.1, 1, 2, 10, 20, 50, 60, 100]

# GridSearchCV accepts a list of grids. Elastic-net needs its own sub-grid
# because LogisticRegression refuses penalty='elasticnet' unless l1_ratio is
# given; in a single flat grid every elastic-net candidate would fail to fit.
param_grid = [
    {
        'C': c_values,
        'penalty': ['l1', 'l2'],
        'solver': ['saga'],
    },
    {
        'C': c_values,
        'penalty': ['elasticnet'],
        'l1_ratio': [0.25, 0.5, 0.75],  # mix between pure L2 (0) and pure L1 (1)
        'solver': ['saga'],
    },
]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Model selection is driven by F1 computed on the positive class (label 1).
scorer = make_scorer(f1_score, pos_label=1)

# Exhaustive 5-fold search over param_grid; n_jobs=-1 runs fits in parallel.
grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence only solver-convergence noise, and only while the search runs.
# A global filterwarnings("ignore") would also hide failed fits and every
# other warning for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)
    grid_search.fit(X_train_vec, y_train)

# Display the fitted search object as the cell output.
grid_search

In [None]:
# Winning hyper-parameter combination and the cross-validated score it achieved.
best_params, best_score = grid_search.best_params_, grid_search.best_score_


-training the model with best params :

In [None]:

# Take the estimator that the grid search exposes as its best candidate.
best_model = grid_search.best_estimator_

-predicting using the model :

In [None]:
# Predict labels for the vectorised test split with the selected model.
y_pred = best_model.predict(X_test_vec)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, make_scorer

# The random forest gets its own vectorizer and feature matrices. Reusing the
# names `tfidf`, `X_train_vec` and `X_test_vec` would replace the features the
# logistic-regression model was trained on, so any logistic-regression cell
# re-run after this one would receive matrices of the wrong width.
tfidf_rf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words='english',
    min_df=2
)

X_train_vec_rf = tfidf_rf.fit_transform(X_train)
X_test_vec_rf = tfidf_rf.transform(X_test)

# Hyper-parameter candidates for the forest.
param_grid1 = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

# Model selection is driven by F1 on the positive class (label 1).
scorer = make_scorer(f1_score, pos_label=1)

grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid1,
    scoring=scorer,
    cv=5,
    verbose=2,
    n_jobs=-1
)

grid_search_rf.fit(X_train_vec_rf, y_train)

# Evaluate the best forest on the held-out split.
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test_vec_rf)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)
print(accuracy_rf)
print(report_rf)


-performance metrics :

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the predictions stored in y_pred against the held-out labels.
accuracy, report = accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)

# One value per line: overall accuracy first, then the per-class report.
print(accuracy, report, sep='\n')


In [None]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer

# XGBoost gets its own vectorizer, feature matrices and parameter grid.
# Reusing `tfidf`, `X_train_vec`, `X_test_vec` and `param_grid` would replace
# the objects the logistic-regression cells depend on, breaking those cells
# if they are re-run after this one.
tfidf_xgb = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words='english',
    min_df=2
)
X_train_vec_xgb = tfidf_xgb.fit_transform(X_train)
X_test_vec_xgb = tfidf_xgb.transform(X_test)

# Hyper-parameter candidates for the boosted trees.
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Model selection is driven by F1 on the positive class (label 1).
scorer = make_scorer(f1_score, pos_label=1)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring=scorer,
    cv=5,
    verbose=2,
    n_jobs=-1
)
grid_search_xgb.fit(X_train_vec_xgb, y_train)

# Evaluate the best booster on the held-out split.
best_xgb = grid_search_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test_vec_xgb)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
report_xgb = classification_report(y_test, y_pred_xgb)
print("accuracy score for xgb : ",accuracy_xgb)
print("classification report for xgb : ",report_xgb)
