In [2]:
!pip install -q pandas numpy scikit-learn matplotlib seaborn nltk joblib

In [3]:
import os, zipfile
import pandas as pd

zip_path = r"C:\Users\hplap\Downloads\test.csv.zip"   # <-- your path (raw string avoids backslash issues)
assert os.path.exists(zip_path), f"File not found: {zip_path}"

# Open the zip and list files
with zipfile.ZipFile(zip_path, 'r') as z:
    print("Files in zip:", z.namelist())
    csv_files = [n for n in z.namelist() if n.lower().endswith('.csv')]
    if len(csv_files) == 0:
        raise RuntimeError("No CSV found inside zip.")
    csv_name = csv_files[0]
    print("Reading:", csv_name)
    df = pd.read_csv(z.open(csv_name))

print("Loaded. Shape:", df.shape)
df.head()


Files in zip: ['test.csv']
Reading: test.csv
Loaded. Shape: (17197, 2)


Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [5]:
# Quick inspection
print("Columns:", df.columns.tolist())
print("\nDtypes:\n", df.dtypes)

# Define obj_cols once at the top so it exists everywhere
obj_cols = df.select_dtypes(include=['object']).columns.tolist()

# Attempt to auto-detect text column
candidates_text = [c for c in df.columns if c.lower() in ('tweet','text','content','message','comment','post')]
if candidates_text:
    text_col = candidates_text[0]
else:
    if len(obj_cols) == 0:
        raise RuntimeError("No text-like columns found.")
    text_col = max(obj_cols, key=lambda c: df[c].astype(str).str.len().mean())

# Attempt to auto-detect label column
candidates_label = [c for c in df.columns if c.lower() in ('class','label','target','hate','category','annotation')]
if candidates_label:
    label_col = candidates_label[0]
else:
    non_obj = [c for c in df.columns if c not in obj_cols]
    label_col = non_obj[0] if non_obj else df.columns[-1]

print(f"\nUsing text column: '{text_col}'")
print(f"Using label column: '{label_col}'")
print("\nValue counts for label column:\n", df[label_col].value_counts(dropna=False))


Columns: ['id', 'tweet']

Dtypes:
 id        int64
tweet    object
dtype: object

Using text column: 'tweet'
Using label column: 'id'

Value counts for label column:
 id
49159    1
31963    1
31964    1
31965    1
31966    1
        ..
31979    1
31980    1
31981    1
31982    1
31983    1
Name: count, Length: 17197, dtype: int64


In [6]:
# Show label distribution and some sample tweets
display(df[[text_col, label_col]].sample(6, random_state=42))
print("\nLabel counts:\n", df[label_col].value_counts(normalize=False))

Unnamed: 0,tweet,id
11123,father's day hppy father's day dad luh yah a...,43086
10381,@user tooo much for @user can't wait for e...,42344
3053,#grillsquare wishes you father's day,35016
6504,@user @user like colorados aboion clinic shoot...,38467
3501,with my baby... #instalove #mamadeniÃ±a #madre...,35464
1315,starbucks makes my macchiato a small latte &gt...,33278



Label counts:
 id
49159    1
31963    1
31964    1
31965    1
31966    1
        ..
31979    1
31980    1
31981    1
31982    1
31983    1
Name: count, Length: 17197, dtype: int64


In [8]:
import re
import string
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)           # remove URLs
    text = re.sub(r'@\w+', '', text)              # remove mentions
    text = re.sub(r'#\w+', '', text)              # remove hashtags
    text = re.sub(r'\d+', '', text)               # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()      # remove extra spaces
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(tokens)

# Apply cleaning
df['clean_text'] = df[text_col].astype(str).apply(clean_text)
df[['clean_text']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hplap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,clean_text
0,find ¦
1,want everyone see new â  â hereâs
2,safe ways heal
3,hp cursed child book reservations already yes ...
4,rd amazing hilarious eli ahmir uncle dave love...


In [11]:
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df[label_col].astype(str)   # labels as strings

# check class distribution
print("Class distribution:\n", y.value_counts())

# if some classes have < 2 samples, we skip stratify
if y.value_counts().min() < 2:
    print("\n⚠️ Some classes have fewer than 2 samples → disabling stratify")
    stratify_option = None
else:
    stratify_option = y

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_option
)

print("Train size:", X_train.shape[0], " Test size:", X_test.shape[0])
print("\nLabel distribution in train:\n", y_train.value_counts())


Class distribution:
 id
49159    1
31963    1
31964    1
31965    1
31966    1
        ..
31979    1
31980    1
31981    1
31982    1
31983    1
Name: count, Length: 17197, dtype: int64

⚠️ Some classes have fewer than 2 samples → disabling stratify
Train size: 13757  Test size: 3440

Label distribution in train:
 id
47758    1
47493    1
37240    1
36076    1
38448    1
        ..
35262    1
31983    1
42604    1
42704    1
42275    1
Name: count, Length: 13757, dtype: int64


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# turn text into numerical features
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print("TF-IDF matrix shape (train):", X_train_vec.shape)

TF-IDF matrix shape (train): (13757, 10000)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# ✅ reduce TF-IDF features to avoid memory explosion
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# ✅ Logistic Regression (smaller, faster solver)
logreg = LogisticRegression(
    max_iter=500,
    class_weight='balanced',
    solver='saga',   # memory efficient
    n_jobs=-1        # use all cores
)
logreg.fit(X_train_vec, y_train)

# ✅ Naive Bayes (already lightweight)
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)

print("✅ Models trained successfully!")


In [16]:
from sklearn.metrics import classification_report, accuracy_score

# Logistic Regression
y_pred_log = logreg.predict(X_test_vec)
print("\n📊 Logistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))

# Naive Bayes
y_pred_nb = nb.predict(X_test_vec)
print("\n📊 Naive Bayes Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


📊 Logistic Regression Results:
Accuracy: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       31963       0.00      0.00      0.00       1.0
       31966       0.00      0.00      0.00       1.0
       31968       0.00      0.00      0.00       1.0
       31971       0.00      0.00      0.00       1.0
       31977       0.00      0.00      0.00       1.0
       31978       0.00      0.00      0.00       0.0
       31979       0.00      0.00      0.00       0.0
       31981       0.00      0.00      0.00       0.0
       31982       0.00      0.00      0.00       1.0
       31988       0.00      0.00      0.00       0.0
       31989       0.00      0.00      0.00       0.0
       31990       0.00      0.00      0.00       1.0
       31993       0.00      0.00      0.00       1.0
       31994       0.00      0.00      0.00       1.0
       31996       0.00      0.00      0.00       1.0
       31998       0.00      0.00      0.00       1.0
       32001       0.00      0.00      0.00       0.0
       32002       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       31963       0.00      0.00      0.00       1.0
       31964       0.00      0.00      0.00       0.0
       31965       0.00      0.00      0.00       0.0
       31966       0.00      0.00      0.00       1.0
       31968       0.00      0.00      0.00       1.0
       31971       0.00      0.00      0.00       1.0
       31977       0.00      0.00      0.00       1.0
       31978       0.00      0.00      0.00       0.0
       31979       0.00      0.00      0.00       0.0
       31981       0.00      0.00      0.00       0.0
       31982       0.00      0.00      0.00       1.0
       31988       0.00      0.00      0.00       0.0
       31989       0.00      0.00      0.00       0.0
       31990       0.00      0.00      0.00       1.0
       31993       0.00      0.00      0.00       1.0
       31994       0.00      0.00      0.00       1.0
       31995       0.00      0.00      0.00       0.0
       31996       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
import joblib

joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(logreg, 'logreg_model.joblib')
joblib.dump(nb, 'naivebayes_model.joblib')

print("Models and vectorizer saved successfully!")

Models and vectorizer saved successfully!


In [20]:
def predict_text(text, model=logreg):
    t = clean_text(text)
    v = vectorizer.transform([t])
    pred = model.predict(v)[0]
    prob = None
    if hasattr(model, "predict_proba"):
        prob = model.predict_proba(v).max()
    return pred, prob

samples = [
    "I hate you and your family",
    "This is the best day ever!",
    "You're such an idiot!"
]

for s in samples:
    print(s, "->", predict_text(s))

I hate you and your family -> ('45435', np.float64(0.00014842163501683613))
This is the best day ever! -> ('45714', np.float64(0.00014101326723574907))
You're such an idiot! -> ('39138', np.float64(0.00013086076370107702))
