In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\SRISHTI
[nltk_data]     BULLA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\SRISHTI
[nltk_data]     BULLA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
stemmer = SnowballStemmer("english")
stopword = set(stopwords.words("english"))

In [5]:
data = pd.read_csv("twitter_data.csv")
data['labels'] = data['class'].map({0: "speech", 1: "Offensive", 2: "Not offensive"})
data = data[["tweet", "labels"]]

In [6]:
def tokenize_stem(text):
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stopword]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [stemmer.stem(word) for word in tokens]
    return tokens


In [7]:
vectorizer = CountVectorizer(tokenizer=tokenize_stem)
X = vectorizer.fit_transform(data['tweet'])
X_train, X_test, y_train, y_test = train_test_split(X, data['labels'], test_size=0.2, random_state=42)

In [9]:
# Using Random Forest classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)


In [10]:
# Predicting on test set
y_pred = classifier.predict(X_test)

In [11]:
# Calculating F1 score
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"F1 Score: {f1}")

F1 Score: 0.8878741111580633


In [12]:
# Example predictions
tweet1 = input("Enter your tweet: ")
tweet_vector1 = vectorizer.transform([tweet1])
prediction1 = classifier.predict(tweet_vector1)
print(f"The tweet is classified as: {prediction1[0]}")


Enter your tweet: Hi bitch
The tweet is classified as: Offensive


In [14]:
# Example predictions
tweet1 = input("Enter your tweet: ")
tweet_vector1 = vectorizer.transform([tweet1])
prediction1 = classifier.predict(tweet_vector1)
print(f"The tweet is classified as: {prediction1[0]}")


Enter your tweet: Pray with me
The tweet is classified as: Not offensive


In [15]:
# Example predictions
tweet1 = input("Enter your tweet: ")
tweet_vector1 = vectorizer.transform([tweet1])
prediction1 = classifier.predict(tweet_vector1)
print(f"The tweet is classified as: {prediction1[0]}")


Enter your tweet: I hate the way you spoke ... I could have made a better person than you
The tweet is classified as: Offensive
