In [1]:
# !python -m spacy download en_core_web_lg

In [2]:
import spacy

In [3]:
nlp = spacy.load("en_core_web_lg")



In [4]:
x = "dog cat lion dsfaf"

In [5]:
doc = nlp(x)

In [6]:
for token in doc:
    
    print(f"Text : {token.text:{10}}, Has Vector : {token.has_vector:{10}}, Vector Norm : {token.vector_norm,2}")

Text : dog       , Has Vector :          1, Vector Norm : (7.0336733, 2)
Text : cat       , Has Vector :          1, Vector Norm : (6.6808186, 2)
Text : lion      , Has Vector :          1, Vector Norm : (6.5120897, 2)
Text : dsfaf     , Has Vector :          0, Vector Norm : (0.0, 2)


## Semantic Similarity

In [7]:
for token1 in doc:
    for token2 in doc:
        
        print(f"Token 1 : {token1.text:{7}}, Token 2: {token2.text:{7}}, Similarity : {token1.similarity(token2)}")

Token 1 : dog    , Token 2: dog    , Similarity : 1.0
Token 1 : dog    , Token 2: cat    , Similarity : 0.8016855120658875
Token 1 : dog    , Token 2: lion   , Similarity : 0.4742448627948761
Token 1 : dog    , Token 2: dsfaf  , Similarity : 0.0
Token 1 : cat    , Token 2: dog    , Similarity : 0.8016855120658875
Token 1 : cat    , Token 2: cat    , Similarity : 1.0
Token 1 : cat    , Token 2: lion   , Similarity : 0.5265436768531799
Token 1 : cat    , Token 2: dsfaf  , Similarity : 0.0
Token 1 : lion   , Token 2: dog    , Similarity : 0.4742448627948761
Token 1 : lion   , Token 2: cat    , Similarity : 0.5265436768531799
Token 1 : lion   , Token 2: lion   , Similarity : 1.0
Token 1 : lion   , Token 2: dsfaf  , Similarity : 0.0
Token 1 : dsfaf  , Token 2: dog    , Similarity : 0.0
Token 1 : dsfaf  , Token 2: cat    , Similarity : 0.0
Token 1 : dsfaf  , Token 2: lion   , Similarity : 0.0
Token 1 : dsfaf  , Token 2: dsfaf  , Similarity : 1.0


  after removing the cwd from sys.path.


## Data Preparation

In [8]:
import pandas as pd
import seaborn as sns
import numpy as np
import Preprocess_gokhanEr as pp

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import classification_report,confusion_matrix

In [9]:
path = "/Users/gokhanersoz/Desktop/Hepsi/NLP/Data/imdb_reviews.txt"

df = pd.read_csv(path , sep = "\t", header = None)
df.columns = ["Reviews","Sentiment"]
df.head()

Unnamed: 0,Reviews,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [10]:
%%time
df["Reviews"] = df["Reviews"].apply(lambda words : pp.cont_exp(words)) 
df["Reviews"] = df["Reviews"].apply(lambda words : pp.cont_exp(words)) 

df["Reviews"] = df["Reviews"].apply(lambda words : pp.remove_accented_chars(words)) 
df["Reviews"] = df["Reviews"].apply(lambda words : pp.remove_emails(words)) 
df["Reviews"] = df["Reviews"].apply(lambda words : pp.remove_html_tags(words)) 
df["Reviews"] = df["Reviews"].apply(lambda words : pp.remove_urls(words)) 
df["Reviews"] = df["Reviews"].apply(lambda words : pp.get_make_base(words)) 
df["Reviews"] = df["Reviews"].apply(lambda words : " ".join(pp.spelling_correction(words).words)) 

CPU times: user 1min 31s, sys: 108 ms, total: 1min 31s
Wall time: 1min 31s


In [11]:
df.head()

Unnamed: 0,Reviews,Sentiment
0,a very very very slow move aimless movie about...,0
1,not sure who was more lose the flat character ...,0
2,attempt artless with black white and clever ca...,0
3,very little music or anything to speak of,0
4,the good scene in the movie was when Gerard is...,1


## ML Model Building

In [12]:
import spacy

In [13]:
nlp = spacy.load("en_core_web_lg")

x = "cat dog"
doc = nlp(x)
doc



cat dog

In [14]:
(doc.vector.shape),(doc.vector.ndim)

((300,), 1)

In [15]:
def get_vec(words):
    
    doc = nlp(words)
    vec = doc.vector
    return vec

In [16]:
df["Vec"] = df["Reviews"].apply(lambda words : get_vec(words))

df.head()

Unnamed: 0,Reviews,Sentiment,Vec
0,a very very very slow move aimless movie about...,0,"[-0.08032064, 0.124854855, -0.24590585, 0.1456..."
1,not sure who was more lose the flat character ...,0,"[0.062192187, 0.1952087, -0.14579107, -0.00481..."
2,attempt artless with black white and clever ca...,0,"[-0.21530148, 0.0040732734, -0.12996358, -0.07..."
3,very little music or anything to speak of,0,"[-0.09093174, 0.25162372, -0.25681874, 0.15846..."
4,the good scene in the movie was when Gerard is...,1,"[0.064886056, 0.13270056, -0.15480983, -0.0207..."


In [17]:
print("DataFrame Shape : {}".format(df.shape))

DataFrame Shape : (748, 3)


In [18]:
X = df["Vec"].to_numpy()
X = X.reshape(-1,1)
X.shape

(748, 1)

In [19]:
liste = []

for vec in df["Vec"].values:
    liste.append(vec)

X = np.array(liste)
print("Last X Shape : {}".format(X.shape))

Last X Shape : (748, 300)


In [20]:
y = df["Sentiment"]

In [21]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.2, random_state = 0, stratify = y)

## ML Model Training And Testing

In [22]:
logistic = LogisticRegression(solver = "liblinear")
logistic.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

In [23]:
y_pred = logistic.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.84      0.81        73
           1       0.84      0.79      0.81        77

    accuracy                           0.81       150
   macro avg       0.81      0.81      0.81       150
weighted avg       0.81      0.81      0.81       150



In [24]:
svc = SVC()
svc.fit(X_train,y_train)

SVC()

In [25]:
y_pred = svc.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80        73
           1       0.81      0.79      0.80        77

    accuracy                           0.80       150
   macro avg       0.80      0.80      0.80       150
weighted avg       0.80      0.80      0.80       150



## Grid Search Cross Validation For HyperParamters Tuning

In [26]:
logistic = LogisticRegression(solver = "liblinear")

In [27]:
hyperparameters = {
    
    "penalty" : ["l1","l2"],
    "C" : [1,2,3,4]
}

In [28]:
def best_grid(classification , hyperparamaters, X, y, cv = 5 ,score = "accuracy"):
    
    best_ = GridSearchCV(estimator=classification,
                         param_grid = hyperparamaters,
                         n_jobs=-1,
                         verbose = 0,
                         scoring=score,
                         cv = cv).fit(X,y)
    
    print(f" {type(classification).__name__.upper()} ".center(50,"#"))
    print()
    print("Best Params :\n\n {}".format(best_.best_params_))
    print()
    print("Best Score : \n\n {}".format(best_.best_score_))
    
    return best_

In [29]:
best_logistic = best_grid(logistic,hyperparameters,X_train,y_train,cv = 5)

############### LOGISTICREGRESSION ###############

Best Params :

 {'C': 2, 'penalty': 'l2'}

Best Score : 

 0.8311064425770308


In [30]:
y_pred = svc.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80        73
           1       0.81      0.79      0.80        77

    accuracy                           0.80       150
   macro avg       0.80      0.80      0.80       150
weighted avg       0.80      0.80      0.80       150



In [31]:
y_pred_best = best_logistic.predict(X_test)
print(classification_report(y_test,y_pred_best))

              precision    recall  f1-score   support

           0       0.78      0.85      0.81        73
           1       0.84      0.77      0.80        77

    accuracy                           0.81       150
   macro avg       0.81      0.81      0.81       150
weighted avg       0.81      0.81      0.81       150



In [32]:
x = "a very very very slow move aimless movie about a distressed drift young man"
new_x = nlp(x).vector.reshape(1,300)
new_x.shape

(1, 300)

In [33]:
def convert_words(words):
    
    nlp = spacy.load("en_core_web_lg")
    vec = nlp(words).vector.reshape(1,300)
    
    return vec

In [34]:
best_logistic.predict(convert_words(x))



array([0])

In [35]:
df["Reviews"][1],df["Sentiment"][1]

('not sure who was more lose the flat character or the audience nearly half of whom walk out',
 0)

In [36]:
best_logistic.predict(convert_words(df["Reviews"][1]))



array([0])

In [37]:
df["Sentiment"][1]

0