# NLP Application

## Importing libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

## Loading dataset

You can pass either `sep='\t'` or `delimiter='\t'` to `pd.read_csv`; `delimiter` is simply an alias for `sep`.

In [None]:
# Load the tab-separated review file (`delimiter` is pandas' alias for `sep`)
# and display the resulting frame as the cell output.
df = pd.read_csv('D16data2.tsv', delimiter='\t')
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [None]:
# Summarise the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [None]:
# Descriptive statistics for the numeric columns; only `Liked` is numeric here.
df.describe()

Unnamed: 0,Liked
count,1000.0
mean,0.5
std,0.50025
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


## Checking NULL values

In [None]:
# Count missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

Review    0
Liked     0
dtype: int64

In [None]:
import nltk
# Fetch NLTK's stopword corpus; it is required by `stopwords.words('english')`
# in the text-cleaning cell further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import re

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Build the cleaned, stemmed text corpus: one space-joined string per review.
#
# Loop-invariant helpers are created once up front. Rebuilding the stopword
# set for every word (and a new stemmer for every row) is pure overhead.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

# NOTE(review): the stopword list is applied as-is; if it contains negations
# such as "not", they are dropped too, which can matter for sentiment - confirm
# whether that is intended.
corpus = []

# Iterate over the column itself instead of a hard-coded range(0, 1000) so the
# cell keeps working if the dataset grows or shrinks.
for raw_review in df['Review']:
    # Keep letters only, lowercase, and tokenise on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Drop stopwords (checked on the unstemmed token), then stem what is left.
    stemmed = [ps.stem(token) for token in tokens
               if token not in english_stopwords]
    corpus.append(' '.join(stemmed))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectoriser whose vocabulary is capped at 1500 terms.
vocabulary_cap = 1500
cv = CountVectorizer(max_features=vocabulary_cap)

In [None]:
# Learn the vocabulary from the cleaned corpus and count term occurrences,
# then densify the sparse result into a plain array for the models below.
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray()

In [None]:
# Target vector: the `Liked` column (the frame's second column) as an array.
y = df['Liked'].values

## Splitting data into training and testing

In [None]:
# NOTE(review): `rand` is not referenced in any visible cell - it looks like a
# stray auto-import; confirm before removing.
from scipy.sparse import rand
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

## Modeling

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes model on the training split.
classifier = GaussianNB()
classifier.fit(X=x_train, y=y_train)

## Prediction

In [None]:
# Predict labels for the held-out test split with the fitted Naive Bayes model.
y_pred = classifier.predict(x_test)

## Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Classification Report

In [None]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.82      0.57      0.67        97
           1       0.68      0.88      0.77       103

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.73      0.72       200



### Confusion Matrix

In [None]:
# Confusion matrix for the Naive Bayes predictions (rows: true, cols: predicted).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[55, 42],
       [12, 91]])

### Accuracy Score

In [None]:
# Fraction of test reviews the Naive Bayes model labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.73

## Decision Tree

### Importing DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

### Modeling

In [None]:
# Fit a decision tree on the training split.
# The tree's fitting procedure is randomised, so without a seed the scores
# change from run to run; seed it (same value as the train/test split) so that
# Restart & Run All reproduces the results.
# NOTE: the metrics recorded below came from an unseeded run and may differ
# slightly after re-execution.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(x_train, y_train)

### Prediction

In [None]:
# Predict test-set labels with the decision tree.
# This rebinds `y_pred`, replacing the Naive Bayes predictions stored earlier.
y_pred = dtc.predict(x_test)

### Evaluation

#### Classification Report

In [None]:
# Per-class precision / recall / F1 for the decision tree predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.64      0.73      0.68        97
           1       0.71      0.61      0.66       103

    accuracy                           0.67       200
   macro avg       0.67      0.67      0.67       200
weighted avg       0.67      0.67      0.67       200



#### Accuracy Score

In [None]:
# Fraction of test reviews the decision tree labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.67

#### Confusion Matrix

In [None]:
# Confusion matrix for the decision tree predictions (rows: true, cols: predicted).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[71, 26],
       [40, 63]])

## Random Forest Algorithm

### Importing RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

### Modeling

In [None]:
# Fit a random forest on the training split.
# Bootstrap sampling and feature selection are random, so seed the estimator
# (same value as the train/test split) to make the reported scores repeatable.
# `fit` returns the estimator itself, so construction and fitting are chained
# instead of rebinding `rfc` to its own return value.
# NOTE: the metrics recorded below came from an unseeded run and may differ
# slightly after re-execution.
rfc = RandomForestClassifier(random_state=0).fit(x_train, y_train)

### Prediction

In [None]:
# Predict test-set labels with the random forest.
# This rebinds `y_pred`, replacing the decision tree predictions stored earlier.
y_pred = rfc.predict(x_test)

### Evaluation

#### Accuracy Score

In [None]:
# Fraction of test reviews the random forest labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.71

#### Confusion Matrix

In [None]:
# Confusion matrix for the random forest predictions (rows: true, cols: predicted).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[89,  8],
       [50, 53]])

#### Classification Report

In [None]:
# Per-class precision / recall / F1 for the random forest predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.64      0.92      0.75        97
           1       0.87      0.51      0.65       103

    accuracy                           0.71       200
   macro avg       0.75      0.72      0.70       200
weighted avg       0.76      0.71      0.70       200

