In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import warnings
import joblib

In [2]:

warnings.filterwarnings('ignore')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\SRISHTI
[nltk_data]     BULLA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\SRISHTI
[nltk_data]     BULLA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Initialize stemmer and stopwords
stemmer = SnowballStemmer("english")
stopword = set(stopwords.words("english"))

In [4]:
# Load the dataset
data = pd.read_csv("sentiment_dataset.csv")

In [5]:
# Map labels to appropriate categories
data['labels'] = data['label'].map({"Neutral": 0, "Offensive": 1, "Not Offensive": 2})

In [6]:
# Function to tokenize and stem the text
def tokenize_stem(text):
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stopword]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [stemmer.stem(word) for word in tokens]
    return tokens

In [7]:
# Initialize CountVectorizer with the custom tokenizer
vectorizer = CountVectorizer(tokenizer=tokenize_stem)

In [8]:
# Transform the tweet text into feature vectors
X = vectorizer.fit_transform(data['text'])


In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, data['labels'], test_size=0.2, random_state=42)


In [10]:
# Initialize Random Forest Classifier
classifier = RandomForestClassifier()
# Train the classifier
classifier.fit(X_train, y_train)

In [11]:
# Predict on the test set
y_pred = classifier.predict(X_test)


In [12]:
# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 Score: {f1 * 100:.2f}%")

F1 Score: 85.95%


In [13]:
# Save the trained model to a file
joblib.dump(classifier, 'sentiment_model.joblib')
joblib.dump(vectorizer, 'vectorizer.joblib')

['vectorizer.joblib']

In [14]:
# Function to classify a new tweet
def classify_tweet(tweet):
    tweet_vector = vectorizer.transform([tweet])
    prediction = classifier.predict(tweet_vector)
    label_map = {0: "Neutral", 1: "Offensive", 2: "Not Offensive"}
    return label_map[prediction[0]]

In [15]:
# Example usage
tweet = input("Enter your tweet: ")
print(f"The tweet is classified as: {classify_tweet(tweet)}")

Enter your tweet: I love you
The tweet is classified as: Not Offensive


In [16]:
# Example usage
tweet = input("Enter your tweet: ")
print(f"The tweet is classified as: {classify_tweet(tweet)}")

Enter your tweet: Dont act like a bitch
The tweet is classified as: Offensive


In [17]:
# Example usage
tweet = input("Enter your tweet: ")
print(f"The tweet is classified as: {classify_tweet(tweet)}")

Enter your tweet: I think we shouldn't do this anymore
The tweet is classified as: Offensive


In [18]:
# Example usage
tweet = input("Enter your tweet: ")
print(f"The tweet is classified as: {classify_tweet(tweet)}")


Enter your tweet: I think you should stop this
The tweet is classified as: Not Offensive
