In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load the dataset
df = pd.read_csv('GSoc CALEL Test-20240320T121822Z-001/GSoc CALEL Test/cyberbullying/aggression_parsed_dataset.csv')

# Data preprocessing - Text cleaning
def clean_text(text):
    # Remove special characters, punctuation, and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a single string
    cleaned_text = ' '.join(tokens)
    return cleaned_text

# Apply text cleaning function to 'Text' column
df['clean_text'] = df['Text'].apply(clean_text)

# Split the data into training and testing sets
X = df['clean_text'].values
y = df['ed_label_1'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed text data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize Logistic Regression model
logreg_model = LogisticRegression()

# Train the model
logreg_model.fit(X_train_tfidf, y_train)

# Make predictions on the testing data
y_pred = logreg_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

# Optional: Analyze feature coefficients
feature_coeffs = logreg_model.coef_[0]
feature_names = tfidf_vectorizer.get_feature_names()
feature_importance = dict(zip(feature_names, feature_coeffs))
sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
top_n = 10
for feature, coef in sorted_features[:top_n]:
    print(f"Feature: {feature}, Coefficient: {coef}")

# Optional: Analyze misclassified instances
misclassified_indices = [i for i, (true, pred) in enumerate(zip(y_test, y_pred)) if true != pred]
for idx in misclassified_indices:
    instance_feature_vector = X_test_tfidf[idx].reshape(1, -1)
    predicted_proba = logreg_model.predict_proba(instance_feature_vector)
    print(f"Instance: {X_test[idx]}")
    print(f"True Label: {y_test[idx]}, Predicted Label: {y_pred[idx]}")
    print(f"Predicted Probabilities: {predicted_proba}")
    print("---------")


ValueError: Unknown label type: 'continuous'