In [None]:
!pip install -r requirements.txt

In [None]:
# Execute the preprocessing notebook before anything else in this notebook.
# NOTE(review): presumably this step produces `tweets_prepro.csv` (read by the
# data-loading cell) — confirm against data_preprocessing.ipynb.
%run data_preprocessing.ipynb

In [None]:
# Imported explicitly so this cell works on a fresh kernel instead of relying
# on `pd` having been leaked into the namespace by an earlier cell.
import pandas as pd

# Preprocessed tweets, read with pandas' default CSV settings.
tweets = pd.read_csv('tweets_prepro.csv')

# The label file is tab-separated and has no header row, so the two column
# names are supplied here.
labels = pd.read_csv('corpus_SexistContent.csv', sep='\t', header=None, names=['tweet_id', 'label'])

# pd.merge defaults to an inner join: only rows whose `tweet_id` appears in
# both frames are kept in `df`.
df = pd.merge(tweets, labels, on='tweet_id')

In [None]:
# Load the stop-word list, one entry per line.
# The `with` block guarantees the file handle is closed, and the explicit
# UTF-8 encoding keeps accented words from depending on the platform's
# default encoding.
with open('stop_word_fr.txt', encoding='utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]


def remove_stopwords(text, stopwords=stopwords):
    """Return `text` with every whitespace-separated token found in `stopwords` removed.

    Parameters
    ----------
    text : str
        Text to filter. Any non-string value (for example the NaN that pandas
        produces for an empty CSV cell) is treated as empty text.
    stopwords : collection of str, optional
        Tokens to drop. Defaults to the list loaded from `stop_word_fr.txt`.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; '' when nothing remains
        or when `text` is not a string.
    """
    # Missing values are floats and have no .split(); map them to empty text
    # instead of raising AttributeError.
    if not isinstance(text, str):
        return ''
    # Hash-based lookup instead of scanning a list once per token.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    return ' '.join(word for word in text.split() if word not in stopwords)


# Build the lookup set once and pass it through `apply` so it is not rebuilt
# for every row.
df['text_clean'] = df['text_clean'].apply(remove_stopwords, stopwords=frozenset(stopwords))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# - random_state pins the shuffle so the split (and every metric computed from
#   it) is the same on each run; 42 matches the seed used for t-SNE below.
# - stratify keeps the label proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    df['text_clean'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Bag-of-words counts: the vocabulary is capped at 1000 entries drawn from
# unigrams, bigrams and trigrams. It is fitted on the training texts only and
# then reused unchanged for the test texts.
vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 3))
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# TF-IDF re-weighting, likewise fitted on the training counts only.
tf_transformer = TfidfTransformer()
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_test_tf = tf_transformer.transform(X_test_counts)

# Linear SVM with class weights balanced by label frequency.
# random_state makes the fitted model repeatable between runs.
clf = LinearSVC(C=0.1, class_weight='balanced', random_state=42)
clf.fit(X_train_tf, y_train)
y_pred = clf.predict(X_test_tf)

# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_test, y_pred))

# Fix the row/column order to the classifier's own class order and show those
# class values on the axes; without display_labels the plot is annotated with
# positional indices (0..n-1) instead of the actual labels.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Reduce the TF-IDF training matrix (converted to a dense array) to two
# dimensions with t-SNE, using a fixed seed.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_embedded = tsne.fit_transform(X_train_tf.toarray())

# Scatter the two embedded coordinates, colouring each point by its training label.
fig, ax = plt.subplots(figsize=(10, 7))
points = ax.scatter(
    X_embedded[:, 0],
    X_embedded[:, 1],
    c=y_train,
    cmap='coolwarm',
    alpha=0.6,
)
ax.set_title("t-SNE projection of TF-IDF vectors")
ax.set_xlabel("t-SNE dim 1")
ax.set_ylabel("t-SNE dim 2")
fig.colorbar(points, ax=ax, label='Label')

# Write the figure to disk before showing it.
fig.savefig("tsne_projection.png", dpi=300, bbox_inches='tight')
plt.show()