In [8]:
import spacy
nlp = spacy.load('en_core_web_lg')
nlp(u'lion').vector # the array has dimension of 300

nlp(u'The quick brown fox jumps over the lazy dog').vector
nlp(u'Fox').vector

array([ -1.4639  ,  -2.4799  ,  -3.8651  ,  -3.0494  ,   0.5724  ,
         6.3227  ,   0.68117 ,   3.3025  ,   4.7643  ,  -4.5324  ,
        -5.0241  ,   1.2499  ,   4.9338  ,  -3.6056  ,  -3.7387  ,
        -1.068   ,   2.5113  ,   3.3629  ,   1.611   ,  11.62    ,
        -1.0748  ,   0.77754 ,   7.5661  ,   0.077073,   0.73049 ,
         3.3523  ,   1.5292  ,   2.5581  ,  -5.3649  ,  -5.7513  ,
        -5.2362  ,  11.104   ,  -2.5218  ,  11.061   ,  -7.5375  ,
        -2.0597  ,   2.3736  ,  -3.9835  ,  -3.1375  ,  -1.5505  ,
         2.5327  ,   1.8263  ,   1.5949  ,   1.9807  ,   1.6303  ,
        -2.579   ,   1.5565  ,   1.7248  ,   0.1474  ,   3.2506  ,
        -0.58425 ,   3.9331  ,  -0.19418 ,   1.0491  ,   2.0897  ,
        -5.6368  ,  -1.912   ,  -9.2915  ,  -3.9354  ,   2.8393  ,
        -2.9731  ,  -3.0288  ,   0.5511  ,   7.0456  ,   3.9736  ,
        -4.4724  ,  -5.1859  ,   0.628   ,   2.5993  ,  -0.80378 ,
         1.4134  ,  -2.048   ,  -1.8045  ,  -6.1265  ,  -5.161

In [11]:
token = nlp(u'like love hate')
for token1 in token:
    for token2 in token:
        print(token1.text,token2.text,token1.similarity(token2))


like like 1.0
like love 0.5212638974189758
like hate 0.5065140724182129
love like 0.5212638974189758
love love 1.0
love hate 0.5708349943161011
hate like 0.5065140724182129
hate love 0.5708349943161011
hate hate 1.0


In [15]:
import spacy
from scipy import spatial

nlp = spacy.load('en_core_web_lg')
print(nlp.vocab.vectors)
print(len(nlp.vocab.vectors))
print(nlp.vocab.vectors.shape)

tokens = nlp(u"dog cat nargle Vaibhav")

for token in tokens:
    print(token.text,token.has_vector,token.vector_norm,token.is_oov)


<spacy.vectors.Vectors object at 0x0000018B5BF04C40>
514157
(514157, 300)
dog True 75.254234 False
cat True 63.188496 False
nargle False 0.0 True
Vaibhav True 13.864747 False


In [21]:
from scipy import spatial

cosine_similarity = lambda x, y: 1-spatial.distance.cosine(x,y)

king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

new_vector = king - man + woman
computed_similarities = []

for word in nlp.vocab:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector,word.vector)
                computed_similarities.append((word,similarity))

computed_similarities = sorted(computed_similarities,key = lambda item: -item[1])
print([w[0].text for w in computed_similarities[:50]])


['king', 'and', 'that', 'where', 'she', 'they', 'woman', 'there', 'should', 'these', 'would', 'those', 'cause', 'ought', 'who', 'might', 'not', 'this', 'when', 'could', 'somethin', 'were', 'all', 'a', 'have', 'he', 'must', 'space', 'what', 'may', 'it', 'was', 'you', 'or', 'had', 'are', 'nothin', 'has', 'need', 'can', 'is', 'how', 'b', 'does', 're', 'p', 'havin', 'on', 'why', 'man']


In [25]:
# sentiment analysis
import nltk

nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

a = 'This is the best movie i ever read! though it had a scope of improvement'
print(sid.polarity_scores(a))
b = 'This is the worst movie ever!!!'
print(sid.polarity_scores(b))

{'neg': 0.0, 'neu': 0.595, 'pos': 0.405, 'compound': 0.8172}
{'neg': 0.499, 'neu': 0.501, 'pos': 0.0, 'compound': -0.7163}


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Vaibhav\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Load the dataset
df = pd.read_csv(r"C:\Users\Vaibhav\OneDrive\Documents\FolderPython\Artificial_Intelligence\Natural_Language_Processing\Nlp_revision\TextFiles\amazonreviews.tsv", sep='\t')
sid2 = SentimentIntensityAnalyzer()

# Initialize new columns for sentiment scores
df['sentiment_pos'] = 0
df['sentiment_neu'] = 0
df['sentiment_neg'] = 0

# Calculate polarity scores for each review
for i in range(len(df)):
    scores = sid2.polarity_scores(df['review'].iloc[i])
    df.at[i, 'sentiment_pos'] = scores['pos']
    df.at[i, 'sentiment_neu'] = scores['neu']
    df.at[i, 'sentiment_neg'] = scores['neg']

# Correctly select the features and label
x = df[['review', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos']]
y = df['label']

# Split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Create a pipeline with TfidfVectorizer and RandomForestClassifier
rdclf = Pipeline([
    ('v', TfidfVectorizer()),  # Vectorizing the review text
    ('clf', RandomForestClassifier())  # Using RandomForestClassifier
])

# Fit the model
rdclf.fit(x_train['review'], y_train)

# Make predictions
y_pred = rdclf.predict(x_test['review'])

# Evaluate accuracy
print(accuracy_score(y_pred, y_test))

# the accuracy is increased from 81 to 83 percent


  df.at[i, 'sentiment_pos'] = scores['pos']
  df.at[i, 'sentiment_neu'] = scores['neu']
  df.at[i, 'sentiment_neg'] = scores['neg']


0.8305
