# NLP ASSIGNMENT

# DATA

In [1]:
import pandas as pd

In [2]:
#TEST DATA
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg')
]

In [3]:
#TRAIN DATA
train = [('I love this sandwich.', 'pos'),
   ('This is an amazing place!', 'pos'),
   ('I feel very good about these beers.', 'pos'),
   ('This is my best work.', 'pos'),
   ("What an awesome view", 'pos'),
   ('I do not like this restaurant', 'neg'),
   ('I am tired of this stuff.', 'neg'),
   ("I can't deal with this", 'neg'),
   ('He is my sworn enemy!', 'neg'),
   ('My boss is horrible.', 'neg')]

In [4]:
train = pd.DataFrame(train)
test = pd.DataFrame(test)

In [5]:
#TRAIN + TEST DATA
data = pd.concat([train, test])
data.columns = ["sentence", "label"]
data = data.reset_index(drop=True)
data

Unnamed: 0,sentence,label
0,I love this sandwich.,pos
1,This is an amazing place!,pos
2,I feel very good about these beers.,pos
3,This is my best work.,pos
4,What an awesome view,pos
5,I do not like this restaurant,neg
6,I am tired of this stuff.,neg
7,I can't deal with this,neg
8,He is my sworn enemy!,neg
9,My boss is horrible.,neg


# PREPROCESSING THE DATA

In [6]:
#Label Encoding the Y Column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data["label"] = le.fit_transform(data["label"])
data

Unnamed: 0,sentence,label
0,I love this sandwich.,1
1,This is an amazing place!,1
2,I feel very good about these beers.,1
3,This is my best work.,1
4,What an awesome view,1
5,I do not like this restaurant,0
6,I am tired of this stuff.,0
7,I can't deal with this,0
8,He is my sworn enemy!,0
9,My boss is horrible.,0


In [7]:
corpus = data["sentence"].tolist()
corpus

['I love this sandwich.',
 'This is an amazing place!',
 'I feel very good about these beers.',
 'This is my best work.',
 'What an awesome view',
 'I do not like this restaurant',
 'I am tired of this stuff.',
 "I can't deal with this",
 'He is my sworn enemy!',
 'My boss is horrible.',
 'The beer was good.',
 'I do not enjoy my job',
 "I ain't feeling dandy today.",
 'I feel amazing!',
 'Gary is a friend of mine.',
 "I can't believe I'm doing this."]

In [8]:
# LIST OF STOP WORDS AND PUNCTUATIONS
from nltk.corpus import stopwords
from string import punctuation
stuff_to_be_removed = list(stopwords.words("english"))+list(punctuation)

In [9]:
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
stemmer = LancasterStemmer()

In [10]:
# Function to tokenize, stem and remove stop words
final_corpus= []
for i in range(len(corpus)):
    text = word_tokenize(corpus[i].lower())
    text = [lancaster_stemmer.stem(y) for y in text if y not in stuff_to_be_removed]
    sent = " ".join(text)
    
    final_corpus.append(sent)

In [11]:
new_df = pd.DataFrame(final_corpus)

In [14]:
new_df

Unnamed: 0,0
0,lov sandwich
1,amaz plac
2,feel good beer
3,best work
4,awesom view
5,lik resta
6,tir stuff
7,ca n't deal
8,sworn enemy
9,boss horr


In [15]:
new_df["label"] = data["label"]

In [16]:
new_df

Unnamed: 0,0,label
0,lov sandwich,1
1,amaz plac,1
2,feel good beer,1
3,best work,1
4,awesom view,1
5,lik resta,0
6,tir stuff,0
7,ca n't deal,0
8,sworn enemy,0
9,boss horr,0


In [17]:
new_df.columns = ["sentences", "labels"]

In [18]:
new_df

Unnamed: 0,sentences,labels
0,lov sandwich,1
1,amaz plac,1
2,feel good beer,1
3,best work,1
4,awesom view,1
5,lik resta,0
6,tir stuff,0
7,ca n't deal,0
8,sworn enemy,0
9,boss horr,0


# CALCULATING TF-IDF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [20]:
vector= tfidf.fit_transform(new_df["sentences"])

In [21]:
vector

<16x30 sparse matrix of type '<class 'numpy.float64'>'
	with 36 stored elements in Compressed Sparse Row format>

In [22]:
vector.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.        , 0.        , 0.        , 0.70710678, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.65674042, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.75411672, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.59754328, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.53468128, 0.

# X-TRAIN, Y-TRAIN, X-TEST, Y-TEST SPLIT

In [23]:
X = vector.toarray()
y = new_df["labels"]

In [24]:
X_train=  X[0:10 ,:]
X_train

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.        , 0.        , 0.        , 0.70710678, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.65674042, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.75411672, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.59754328, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.53468128, 0.

In [25]:
X_test=  X[10:16 ,:]
X_test

array([[0.        , 0.        , 0.        , 0.70710678, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.70710678, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.52651698, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.52651698, 0.        ,
        0.        , 0.        , 0.41029211, 0.

In [26]:
y_train = y[0:10] 
y_test = y[10:16]

# MODEL - Gaussian Naive Bayes

In [27]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [28]:
gnb.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [29]:
gnb.score(X_test, y_test)

0.6666666666666666

# MODEL - Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [31]:
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [32]:
logreg.score(X_test, y_test)

0.6666666666666666

# MODEL - Linear Support Vector Machine

In [33]:
from sklearn.svm import SVC
svc = SVC(kernel='linear') 

In [34]:
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [35]:
svc.score(X_test, y_test)

0.6666666666666666