In [2]:
import pandas as pd

# Load your dataset
df = pd.read_csv("/content/twitter_sexism_parsed_dataset.csv")

# Show first few rows
df.head()

Unnamed: 0,index,id,Text,Annotation,oh_label
0,5.35198627292254e+17,5.35198627292254e+17,RT @BeepsS: @senna1 @BeepsS: I'm not sexist bu...,sexism,1.0
1,5.75984924030714e+17,5.75984924030714e+17,There's some very hate able teams this year #MKR,none,0.0
2,5.7233536016588e+17,5.7233536016588e+17,"RT @The_Eccles: ""Everyone underestimated us"" \...",none,0.0
3,5.72337925708374e+17,5.72337925708374e+17,RT @NOTLukeDarcy: did @Channel7 or #MKR actual...,none,0.0
4,4.43033024528011e+17,4.43033024528011e+17,"No, you don't. @Shut_Up_Jeff: I thought of a r...",sexism,1.0


In [3]:
print(df.columns)

Index(['index', 'id', 'Text', 'Annotation', 'oh_label'], dtype='object')


In [8]:
df = pd.read_csv('twitter_sexism_parsed_dataset.csv')
df = df[['Text', 'Annotation']]
df.head()

Unnamed: 0,Text,Annotation
0,RT @BeepsS: @senna1 @BeepsS: I'm not sexist bu...,sexism
1,There's some very hate able teams this year #MKR,none
2,"RT @The_Eccles: ""Everyone underestimated us"" \...",none
3,RT @NOTLukeDarcy: did @Channel7 or #MKR actual...,none
4,"No, you don't. @Shut_Up_Jeff: I thought of a r...",sexism


In [9]:
df_sample = df.sample(n=1000, random_state=42).copy()
df_sample.reset_index(drop=True, inplace=True)

In [10]:
import re

def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', str(text))  # Remove URLs
    text = re.sub(r'\@[\w]*', '', text)  # Remove mentions
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove punctuation and digits
    text = text.lower().strip()
    return text

df_sample['cleaned_text'] = df_sample['Text'].apply(clean_text)
df_sample[['Text', 'cleaned_text']].head()

Unnamed: 0,Text,cleaned_text
0,I want Sheri &amp; Emilie to have their own sh...,i want sheri amp emilie to have their own show...
1,RT @kennedy_jordan How can feminists say they ...,rt how can feminists say they want equality w...
2,@JihadiA8 There was no former glory. Just a b...,there was no former glory just a backward sta...
3,"That's funny, most House Republicans are men. ...",thats funny most house republicans are men im...
4,@Bohagan81 Nope!,nope


In [11]:
df_sample['label'] = df_sample['Annotation'].map({'sexism': 1, 'none': 0})
df_sample[['Annotation', 'label']].head()

Unnamed: 0,Annotation,label
0,none,0
1,sexism,1
2,none,0
3,sexism,1
4,none,0


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Limit number of features to avoid lag
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1,2))

X = vectorizer.fit_transform(df_sample['cleaned_text'])
y = df_sample['label']

In [14]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model.fit(X_train, y_train)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.99      0.89       159
           1       0.80      0.10      0.17        41

    accuracy                           0.81       200
   macro avg       0.81      0.55      0.53       200
weighted avg       0.81      0.81      0.75       200

Confusion Matrix:
 [[158   1]
 [ 37   4]]


In [17]:
def predict_text(text):
    cleaned = clean_text(text)
    vectorized = vectorizer.transform([cleaned])
    prediction = model.predict(vectorized)[0]
    label = 'sexism' if prediction == 1 else 'none'
    return label

# üîç Try it
sample = "Women are not good at driving."
print("Prediction:", predict_text(sample))

Prediction: none
