In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the four competition CSV files from the working directory
csv_names = ('train.csv', 'test.csv', 'test_labels.csv', 'sample_submission.csv')
train_df, test_df, test_labels_df, sub_df = map(pd.read_csv, csv_names)

In [None]:
# Preview the first rows of the training frame: id, comment text and the six label columns
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Preview the first rows of the test frame: id and comment text only, no label columns
test_df.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [None]:
# Preview the test label file: same six label columns as the training frame, keyed by id
test_labels_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [None]:
# Preview the submission template: one row per test id, one score column per label
sub_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


In [None]:
# Preprocess the text data (e.g., cleaning, tokenization, padding)
# Here, we will use simple text cleaning for illustration
import re
def clean_text(text):
    """Normalise a raw comment into lowercase text made of word characters and spaces.

    Steps, in order: newlines become spaces, ``[[User...]]`` wiki links are
    removed, URLs starting with ``http`` are removed, every remaining
    non-word character becomes a space, and the result is lowercased.

    Parameters
    ----------
    text : str
        Raw comment text. Any non-string value (for example the float NaN
        pandas produces for an empty CSV cell) is treated as an empty comment.

    Returns
    -------
    str
        The cleaned text. Runs of spaces are left as they are, not collapsed.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings such as NaN; map them to "no text".
        return ''
    text = re.sub(r'\n', ' ', text)
    # Non-greedy and anchored on the closing "]]": a greedy `.*\]` would erase
    # everything up to the LAST "]" in the comment instead of just the user link.
    text = re.sub(r'\[\[User.*?\]\]', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\W', ' ', text)
    return text.lower()

# Run the same cleaner over the comment column of both frames
for frame in (train_df, test_df):
    frame['comment_text'] = frame['comment_text'].apply(clean_text)

# Features are the cleaned comments; targets are the six label columns
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
X = train_df['comment_text']
y = train_df[label_columns]

# Hold out 20% of the training rows for validation, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary and weights from the training split only,
# capped at 10,000 features
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)

# Project the validation and test comments with the already-fitted vectorizer
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(texts) for texts in (X_val, test_df['comment_text'])
)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

# Base learner: logistic regression (L2 is scikit-learn's default penalty)
log_reg = LogisticRegression(max_iter=1000)

# Wrap it so that one copy of the base learner is fitted per label column
multi_target_model = MultiOutputClassifier(log_reg, n_jobs=-1)

# Candidate values for C; the `estimator__` prefix routes the parameter
# to the wrapped LogisticRegression
param_grid = {'estimator__C': [0.01, 0.1, 1, 10, 100]}

# 3-fold cross-validated search over C, scored by ROC AUC
grid_search = GridSearchCV(
    estimator=multi_target_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=2,
)
grid_search.fit(X_train_tfidf, y_train)

# Best estimator found by the search
best_model = grid_search.best_estimator_

# predict_proba returns one 2-D probability array per label; take column 1
# of each and stack them side by side into a single array
y_val_pred = np.column_stack(
    [proba[:, 1] for proba in best_model.predict_proba(X_val_tfidf)]
)

# Macro-averaged ROC AUC on the held-out validation split
roc_auc = roc_auc_score(y_val, y_val_pred, average='macro')
print(f'Validation ROC AUC: {roc_auc:.4f}')



Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END ..................................estimator__C=0.01; total time=   5.1s
[CV] END ..................................estimator__C=0.01; total time=   3.8s
[CV] END ..................................estimator__C=0.01; total time=   2.7s
[CV] END ...................................estimator__C=0.1; total time=   3.8s
[CV] END ...................................estimator__C=0.1; total time=   4.0s
[CV] END ...................................estimator__C=0.1; total time=   5.4s
[CV] END .....................................estimator__C=1; total time=   7.0s
[CV] END .....................................estimator__C=1; total time=  10.7s
[CV] END .....................................estimator__C=1; total time=   7.9s
[CV] END ....................................estimator__C=10; total time=  17.7s
[CV] END ....................................estimator__C=10; total time=  17.8s
[CV] END ....................................esti

In [None]:
# Predict per-label probabilities for the test data with the tuned estimator.
# `best_model` is the estimator picked by the grid search; the name `model`
# is not defined anywhere in the visible notebook and would raise NameError
# on a fresh kernel.
test_pred = best_model.predict_proba(X_test_tfidf)
# One 2-D probability array per label -> single array with one column per label
test_pred = np.array([pred[:, 1] for pred in test_pred]).T

# Prepare the submission file
submission_df = pd.read_csv('sample_submission.csv')
# Predictions are row-aligned with test_df, so the template must list the
# same ids in the same order before the scores are written into it.
assert submission_df['id'].tolist() == test_df['id'].tolist(), \
    'sample_submission.csv ids do not line up with test.csv'
submission_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']] = test_pred
submission_df.to_csv('submission.csv', index=False)


In [None]:
# Sanity-check the first rows of the submission frame: one predicted score per label and id
submission_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998644,0.203019,0.994084,0.06723,0.966773,0.294976
1,0000247867823ef7,0.007647,0.001236,0.004206,0.00052,0.005492,0.002802
2,00013b17ad220c46,0.012258,0.001603,0.009265,0.000791,0.008836,0.002186
3,00017563c3f7919a,0.003802,0.002146,0.003495,0.001001,0.003669,0.001328
4,00017695ad8997eb,0.031565,0.003545,0.01059,0.001718,0.010717,0.003229
