In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns


In [24]:
# Make sure the NLTK stop-word corpus is available locally, then keep the
# English list as a set so later membership checks are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to C:\Users\Muhammad
[nltk_data]     Zaqeem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# Column names are supplied explicitly via `names=`, so the first line of the
# CSV is read as data rather than as a header.
# NOTE(review): the displayed rows show values such as 'Borderlands' and
# 'Nvidia' under 'Country' -- the column name looks misleading; confirm
# before renaming, since other cells may refer to it.
col = [
    'Id',
    'Country',
    'Sentiment',
    'Text',
]
data = pd.read_csv('twitter_training.csv', names=col)

# Defensive normalisation: drop any surrounding whitespace from column names.
data.columns = [name.strip() for name in data.columns]
data

Unnamed: 0,Id,Country,Sentiment,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [26]:
def clean_text(text):
    """Normalise one raw text entry into lowercase, stop-word-free tokens.

    Parameters
    ----------
    text : object
        A single cell from the text column. Anything that is not a ``str``
        (for example a NaN from a missing value) is treated as empty.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, or ``""`` when the
        input is not a string.

    Notes
    -----
    Relies on the module-level ``stop_words`` collection for filtering.
    """
    # Guard clause: non-string entries carry no text to clean.
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    # Every non-word character (punctuation, symbols, ...) becomes a space.
    spaced = re.sub(r'\W', ' ', lowered)
    # Runs of whitespace collapse to a single space.
    collapsed = re.sub(r'\s+', ' ', spaced)
    # Keep only the tokens that are not stop words.
    kept = filter(lambda token: token not in stop_words, collapsed.split())
    return ' '.join(kept)

# Replace every entry of the Text column with its cleaned form
# (clean_text is called once per row, including rows with missing text).
data['Text'] = data['Text'].map(clean_text)

In [28]:
# Turn the cleaned text into TF-IDF features, keeping at most 5000 terms.
#
# X is left as the SciPy sparse matrix that fit_transform returns. Converting
# it with .toarray() would allocate n_rows * 5000 float64 values (about 3 GB
# for the ~74.7k rows loaded above), almost all of them zeros, whereas
# train_test_split and the scikit-learn estimators used in this notebook
# accept sparse input directly.
#
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so the IDF weights also reflect the held-out rows.
# Consider fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['Text'])

In [30]:
# Convert the sentiment labels to integer codes and store them back in the
# frame; `encoder` retains the fitted label <-> code mapping.
# NOTE: the Sentiment column is overwritten, so re-running this cell without
# reloading `data` re-fits the encoder on the integer codes instead of the
# original label strings.
encoder = LabelEncoder()
data['Sentiment'] = encoder.fit(data['Sentiment']).transform(data['Sentiment'])

# Target vector used for training and evaluation.
y = data['Sentiment']

In [33]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import time


In [35]:
# Candidate classifiers to benchmark, keyed by the display name used in the
# results table and plots.
#
# The tree-based estimators are stochastic (feature/sample sub-selection and
# tie-breaking), so they are seeded with the same value as the train/test
# split; without a seed the reported accuracies change on every run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVC": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

In [None]:
# Benchmark every candidate model: fit on the training split, predict on the
# test split, and record accuracy plus the time spent in each phase.
# `results` is a column-oriented dict (one list per column) that is turned
# into `results_df`, which the plotting cells read.
results = {
    "Model": [],
    "Accuracy": [],
    "Training Time (s)": [],
    "Testing Time (s)": []
}

for model_name, model in models.items():
    # time.perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() is wall-clock time and can jump if the
    # system clock is adjusted during a long fit.
    start_train = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_train

    start_test = time.perf_counter()
    y_pred = model.predict(X_test)
    testing_time = time.perf_counter() - start_test

    accuracy = accuracy_score(y_test, y_pred)

    results["Model"].append(model_name)
    results["Accuracy"].append(accuracy)
    results["Training Time (s)"].append(training_time)
    results["Testing Time (s)"].append(testing_time)


results_df = pd.DataFrame(results)
print(results_df)

In [None]:
# Set the style of the plots
sns.set(style="whitegrid")

# Plot accuracy comparison: one bar per model.
# The bars are keyed on the "Model" column so the tick labels show the model
# names; plotting against results_df.index would label them 0..N-1 under an
# axis titled "Model".
plt.figure(figsize=(10, 6))
sns.barplot(x="Model", y="Accuracy", data=results_df, palette="viridis")
plt.title("Model Accuracy Comparison")
plt.xlabel("Model")
plt.ylabel("Accuracy")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Plot training time comparison: one bar per model.
# The bars are keyed on the "Model" column so the tick labels show the model
# names; plotting against results_df.index would label them 0..N-1 under an
# axis titled "Model".
plt.figure(figsize=(10, 6))
sns.barplot(x="Model", y="Training Time (s)", data=results_df, palette="rocket")
plt.title("Model Training Time Comparison")
plt.xlabel("Model")
plt.ylabel("Training Time (seconds)")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Plot testing time comparison: one bar per model.
# The bars are keyed on the "Model" column so the tick labels show the model
# names; plotting against results_df.index would label them 0..N-1 under an
# axis titled "Model".
plt.figure(figsize=(10, 6))
sns.barplot(x="Model", y="Testing Time (s)", data=results_df, palette="mako")
plt.title("Model Testing Time Comparison")
plt.xlabel("Model")
plt.ylabel("Testing Time (seconds)")
plt.xticks(rotation=45)
plt.show()
