<a href="https://colab.research.google.com/github/swarajko/twitter-sentiment-analysis-NLP/blob/main/twittersentiment(RF).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
rawdataset = pd.read_csv('/content/twitter_training.csv')

# Drop rows where the third column contains "Irrelevant"
dataset = rawdataset[rawdataset.iloc[:, 2] != "Irrelevant"]

# Save the cleaned data
cleaned_file_path = "/content/cleaned_twitter.csv"
dataset.to_csv(cleaned_file_path)

# Return the path to the cleaned file
cleaned_file_path

'/content/cleaned_twitter.csv'

In [3]:
y = dataset.iloc[:, 2].values
print(y)

['Positive' 'Positive' 'Positive' ... 'Positive' 'Positive' 'Positive']


In [4]:
mapping = {'Neutral': 0, 'Positive': 1, 'Negative': -1}

# Apply mapping
y_encoded = np.vectorize(mapping.get)(y)

print(y_encoded)

[1 1 1 ... 1 1 1]


In [5]:

column_names = ['Column1', 'Column2', 'Column3', 'Review']

# Assign column names to the dataset
dataset.columns = column_names

# Print the first few rows to verify
print(dataset.head())


   Column1      Column2   Column3  \
0     2401  Borderlands  Positive   
1     2401  Borderlands  Positive   
2     2401  Borderlands  Positive   
3     2401  Borderlands  Positive   
4     2401  Borderlands  Positive   

                                              Review  
0  I am coming to the borders and I will kill you...  
1  im getting on borderlands and i will kill you ...  
2  im coming on borderlands and i will murder you...  
3  im getting on borderlands 2 and i will murder ...  
4  im getting into borderlands and i can murder y...  


## Cleaning the texts

In [6]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 61691):
  review = re.sub(r'[^a-zA-Z]', ' ', str(dataset['Review'].iloc[i]))
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
print(corpus)



## Creating the Bag of Words model

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 19000)
X = cv.fit_transform(corpus).toarray()

In [9]:
len(X[0])

19000

## Splitting the dataset into the Training set and Test set

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_encoded_train, y_encoded_test = train_test_split(X, y_encoded, test_size = 0.20, random_state = 0)

## Training the Random Forest model on the Training set




In [12]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_encoded_train)

## Predicting the Test set results

In [13]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_encoded_test.reshape(len(y_encoded_test),1)),1))

[[ 1  1]
 [ 0  0]
 [-1 -1]
 ...
 [ 0  0]
 [ 1  1]
 [ 0  0]]


## Making the Confusion Matrix

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_encoded_test, y_pred)
print(cm)
accuracy_score(y_encoded_test, y_pred)

[[4121  133  256]
 [ 174 3261  268]
 [ 202  150 3774]]


0.9041251316962476