In [5]:
import sklearn
print(sklearn.__version__)


1.5.2


In [6]:
import pandas as pd
train = pd.read_csv('train.csv')
print("Training Set:"% train.columns, train.shape, len(train))
test = pd.read_csv('test.csv')
print("Test Set:"% test.columns, test.shape, len(test))

Training Set: (31962, 3) 31962
Test Set: (17197, 2) 17197


In [7]:
import re
def  clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    return df
test_clean = clean_text(test, "tweet")
train_clean = clean_text(train, "tweet")

In [8]:
from sklearn.utils import resample
train_majority = train_clean[train_clean.label==0]
train_minority = train_clean[train_clean.label==1]
train_minority_upsampled = resample(train_minority,replace=True,n_samples=len(train_majority),random_state=123)
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
train_upsampled['label'].value_counts()

label
1    29720
0    29720
Name: count, dtype: int64

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
pipeline_sgd = Pipeline([('vect', CountVectorizer()),('tfidf',  TfidfTransformer()),('nb', SGDClassifier()),])

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'],
                                                    train_upsampled['label'],random_state = 0)

In [11]:
model = pipeline_sgd.fit(X_train, y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.9695353643090461

In [32]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'vect__max_df': [0.75, 1.0],             
    'vect__ngram_range': [(1, 1), (1, 2)],    
    'tfidf__use_idf': [True, False],          
    'nb__alpha': [0.0001, 0.001, 0.01],      
    'nb__max_iter': [1000, 2000],             
    'nb__penalty': ['l2', 'elasticnet'],}

In [33]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
pipeline_sgd = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('nb', SGDClassifier())])
grid_search = GridSearchCV(pipeline_sgd, param_grid, scoring='f1', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'], train_upsampled['label'], random_state=0)

In [35]:
print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_


Best parameters: {'nb__alpha': 0.0001, 'nb__max_iter': 1000, 'nb__penalty': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 0.75, 'vect__ngram_range': (1, 2)}


In [None]:
print("Starting grid search...")
grid_search.fit(X_train, y_train)
print("Grid search completed.")
print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_


Starting grid search...


In [None]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
y_pred = best_model.predict(X_test)
print("F1 Score:", f1_score(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
vectorizer = pipeline_sgd.named_steps['vect']  
classifier = pipeline_sgd.named_steps['nb']  
feature_names = vectorizer.get_feature_names_out()
weights = classifier.coef_[0]  
feature_importance = sorted(zip(feature_names, weights), key=lambda x: abs(x[1]), reverse=True)
print("Top 20 important features for detecting hate speech:")
for feature, weight in feature_importance[:20]:
    print(f"{feature}: {weight}")
