## **Importing Neccesary Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Regular Expression
import re

# Natural Language ToolKit
import nltk
from nltk.corpus import stopwords

# SciKit-Learn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

## **Importing the DATASET from my GitHub Repository**

In [2]:
url ="https://github.com/yashwanth-yadav-0103/Machine_Learning/raw/main/Sentiment_Analysis/tweets_covid19_dataset.csv"
df =pd.read_csv(url)

# **Data Preprocessing**

In [3]:
print("The size of the dataset is \n {}".format(df.shape))

The size of the dataset is 
 (179859, 2)


In [4]:
print(df.columns.tolist())

['tweets', 'sentiment']


In [5]:
print(df.head(3))

                                              tweets sentiment
0  Chinese citizens caught faking COVID-19 tests ...   neutral
1  RT @RunesSmash: After Covid dies down, Can we ...  negative
2  RT @Neurophysik: Many COVID-19 patients recove...  positive


In [6]:
print(df.nunique())

tweets       64987
sentiment        3
dtype: int64


In [7]:
print(df.isnull().sum())

tweets       0
sentiment    0
dtype: int64


## **Features and Labels ...**

In [8]:
# The input / Independent variable
feature = df.iloc[:,0].values

# The output / Dependent variable
label = df.iloc[:,1].values

## **Data Processing with Regular Expression ...**

In [9]:
Filtered_data = []
for i in range(0, len(feature)):

    # To remove speacial characters
    Partial_Data = re.sub(r'\W', ' ', str(feature[i]))

    # To remove single characters
    Partial_Data= re.sub(r'\s+[a-zA-Z]\s+', ' ', Partial_Data)

    # To remove multiple spaces
    Partial_Data = re.sub(r'\s+', ' ', Partial_Data, flags=re.I)

    # To remove lower case
    Partial_Data = Partial_Data.lower()

    # Removing prefixed 'b'
    Partial_Data = re.sub(r'^b\s+', '', Partial_Data)

    # To remove single characters from the start
    Partial_Data = re.sub(r'\^[a-zA-Z]\s+', ' ', Partial_Data)


    Filtered_data.append(Partial_Data)

## **Removing StopWords and Feature Extraction...**
**(Converting Preprocessed Text Data into Numerical Data)**

In [10]:
nltk.download('stopwords')

vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))

Final_data= vectorizer.fit_transform(Filtered_data).toarray()
print(Final_data)

[nltk_data] Downloading package stopwords to C:\Users\YASHWANTH
[nltk_data]     YADAV\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## **Spliting DATASET into Train Data and Test Data**

In [11]:
X_train, X_test, y_train, y_test = train_test_split(Final_data, label, test_size=0.15, random_state=0)

## **Training the Model with Test Data using ML Algorithm (RFC)**

In [12]:
Model= RandomForestClassifier(n_estimators=200, random_state=0)
Model.fit(X_train, y_train)

## **Evaluating the Model**
**(Accuracy of the Model)**

In [None]:
predict_sentiment = Model.predict(X_test)
Model_accuracy=(accuracy_score(y_test, predict_sentiment))*100
print("The Model Accuracy is {:.4f} % ".format(Model_accuracy))

The Model Accuracy is 95.5521 % 
