In [None]:
# Install the Kaggle API token under ~/.kaggle.
# Expects kaggle.json to be present in the current working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credential file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
"jp797498e/twitter-entity-sentiment-analysis"

In [None]:
# Fetch the dataset archive with the Kaggle CLI; -d names the dataset as owner/slug.
!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis


Dataset URL: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis
License(s): CC0-1.0
Downloading twitter-entity-sentiment-analysis.zip to /content
  0% 0.00/1.99M [00:00<?, ?B/s]
100% 1.99M/1.99M [00:00<00:00, 635MB/s]


In [None]:
!unzip /content/twitter-entity-sentiment-analysis.zip

Archive:  /content/twitter-entity-sentiment-analysis.zip
  inflating: twitter_training.csv    
  inflating: twitter_validation.csv  


In [None]:
import pandas as pd

# Both CSV files ship WITHOUT a header row. Letting pandas infer one would
# consume the first record as column names and silently drop one sample per
# file, so header inference is disabled and the schema is supplied explicitly.
COLUMN_NAMES = ['id', 'category', 'label', 'text']

training_df = pd.read_csv(
    "/content/twitter_training.csv", header=None, names=COLUMN_NAMES
)
validation_df = pd.read_csv(
    "/content/twitter_validation.csv", header=None, names=COLUMN_NAMES
)


In [None]:
# Preview the first five rows of the training frame.
training_df.head(5)

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [None]:
# Summarize column names, dtypes, and non-null counts of the training frame.
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [None]:
# Give the four columns descriptive names.
training_df.columns = ['id', 'category', 'label', 'text']

# Discard rows that have no tweet text, then remove exact duplicate rows.
training_df = (
    training_df
    .dropna(subset=['text'])
    .drop_duplicates()
)

In [None]:
# Give the four columns descriptive names.
validation_df.columns = ['id', 'category', 'label', 'text']

# Discard rows that have no tweet text, then remove exact duplicate rows.
validation_df = (
    validation_df
    .dropna(subset=['text'])
    .drop_duplicates()
)

In [None]:
# Non-null value count for each column of the validation frame.
validation_df.count()

Unnamed: 0,0
id,999
category,999
label,999
text,999


In [None]:
# Class distribution of the training labels.
training_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Negative,21698
Positive,19712
Neutral,17708
Irrelevant,12537


In [None]:
# Class distribution of the validation labels.
validation_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Neutral,285
Positive,277
Negative,266
Irrelevant,171


In [None]:
import re

def clean_text(text):
    """Normalize a tweet to lowercase a-z words separated by single spaces.

    The input is lowercased, every run that starts with ``http`` and extends
    to the next whitespace is dropped, every character that is neither
    ``a``-``z`` nor whitespace is deleted, and finally whitespace runs are
    squeezed to one space and the ends are trimmed.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    # Applied in order; each entry is (regex, replacement).
    substitutions = (
        (r"http\S+", ""),    # strip URLs
        (r"[^a-z\s]", ""),   # delete digits, punctuation, non a-z symbols
        (r"\s+", " "),       # squeeze whitespace runs
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Store the normalized tweet text next to the raw text.
training_df['clean_text'] = training_df['text'].map(clean_text)

In [None]:
# Normalize the validation tweets themselves. The source must be
# validation_df['text']: assigning a Series derived from training_df would be
# index-aligned and fill this column with cleaned *training* tweets.
validation_df['clean_text'] = validation_df['text'].apply(clean_text)

In [None]:
# Re-check the training frame's schema now that clean_text has been added,
# then show raw and normalized text side by side for the first rows.
training_df.info()
training_df[['text', 'clean_text']].head()


<class 'pandas.core.frame.DataFrame'>
Index: 71655 entries, 0 to 74680
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          71655 non-null  int64 
 1   category    71655 non-null  object
 2   label       71655 non-null  object
 3   text        71655 non-null  object
 4   clean_text  71655 non-null  object
dtypes: int64(1), object(4)
memory usage: 5.3+ MB


Unnamed: 0,text,clean_text
0,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...
1,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you all
2,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
3,im getting on borderlands 2 and i will murder ...,im getting on borderlands and i will murder yo...
4,im getting into borderlands and i can murder y...,im getting into borderlands and i can murder y...


In [None]:
!pip install nltk

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): the bag-of-words features are built from the raw 'text'
# column; the 'clean_text' column computed earlier is not used here —
# confirm this is intentional.
train_texts = training_df['text']
validation_texts = validation_df['text']

# Learn the vocabulary on the training tweets only, then reuse it unchanged
# to encode the validation tweets.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_validation = vectorizer.transform(validation_texts)

In [None]:
# Target vectors: the sentiment label of each tweet.
y_train, y_validation = training_df['label'], validation_df['label']

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Fit a 100-tree random forest; the fixed random_state keeps runs repeatable.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the validation feature matrix.
y_pred = rf_clf.predict(X_validation)

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_validation, y_pred)
print("Confusion Matrix:\n", cm)

# Per-class precision / recall / F1 plus the overall averages.
report = classification_report(y_validation, y_pred)
print("\nClassification Report:\n", report)

Confusion Matrix:
 [[164   3   0   4]
 [  0 261   4   1]
 [  0   3 278   4]
 [  2   4   5 266]]

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.99      0.96      0.97       171
    Negative       0.96      0.98      0.97       266
     Neutral       0.97      0.98      0.97       285
    Positive       0.97      0.96      0.96       277

    accuracy                           0.97       999
   macro avg       0.97      0.97      0.97       999
weighted avg       0.97      0.97      0.97       999



In [None]:
import joblib

# Output file names for the two fitted artifacts.
MODEL_FILE = 'rf_model.pkl'
VECTORIZER_FILE = 'vectorizer.pkl'

# Persist the trained classifier.
joblib.dump(rf_clf, MODEL_FILE)

# Persist the fitted vectorizer as well; kept as the cell's final expression
# so the list of written file names is still displayed.
joblib.dump(vectorizer, VECTORIZER_FILE)


['vectorizer.pkl']

In [None]:
from google.colab import files

# Send each saved artifact (model first, then vectorizer) to the browser.
for artifact in ('rf_model.pkl', 'vectorizer.pkl'):
    files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>