# DSCI 614 Text Mining
# Pro 5: Vectors and Similarity 


#### 1. Load the dataset of Twitter_Data.csvLinks to an external site. into memory.

In [1]:
import pandas as pd
twitter_data=pd.read_csv('./Twitter_Data.csv')
twitter_data.head(10)

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
5,kiya tho refresh maarkefir comment karo,0.0
6,surat women perform yagna seeks divine grace f...,0.0
7,this comes from cabinet which has scholars lik...,0.0
8,with upcoming election india saga going import...,1.0
9,gandhi was gay does modi,1.0


#### 2. Method 1: **dot** and **norm** from numpy ans scipy library

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import dot
from scipy.sparse.linalg import norm 

# Create a TfidfVectorizer Object using default parameters: use_idf=True, smooth_idf=True, sublinear_tf=False
tfidf_vectorizer = TfidfVectorizer(use_idf=True,smooth_idf=True, sublinear_tf=False)

# the 100th and 10,000th tweets 
observation2 = [twitter_data['clean_text'][99], twitter_data['clean_text'][9999]]

# Fit to the corpus, then convert a collection of raw documents to a matrix of TF-IDF features.
tf_idf_matrix = tfidf_vectorizer.fit_transform(observation2)

print(f'The size of the tf_idf matrix for the texts = {tf_idf_matrix.get_shape()}')

# Compute the cosine similarity based on the formula above
cos_sine =  dot(tf_idf_matrix[0, :], tf_idf_matrix[1, :].T)/(norm(tf_idf_matrix[0, :])*norm(tf_idf_matrix[1, :]))

print(f"The cosine similarity between {observation2[0]} and {observation2[1]}= {cos_sine.todense()}")

The size of the tf_idf matrix for the texts = (2, 51)
The cosine similarity between modi politics hate modiji loves india modiji want make new india corruption free terror free india hate not nature modiji yes modiji hates only enemies our country terrorists destroying terrorists camp hatedont defame our humane kind pure honest and knows once modi comes again his entire family will jail for corruption only way avoid jail need come back power will not hesitate even selling the country just come back power corruption his blood= [[0.05834325]]


#### 3. Method 2: **cosine** from Scipy Library

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial
import numpy as np
# Create a TfidfVectorizer Object using default parameters: use_idf=True, smooth_idf=True, sublinear_tf=False
tfidf_vectorizer = TfidfVectorizer(use_idf=True,smooth_idf=True, sublinear_tf=False)

# the 100th and 10,000th tweets 
observation2 = [twitter_data['clean_text'][99], twitter_data['clean_text'][9999]]

# Fit to the corpus, then convert a collection of raw documents to a matrix of TF-IDF features.
tf_idf_matrix = tfidf_vectorizer.fit_transform(observation2)

arr0 = np.squeeze(np.asarray(tf_idf_matrix[0, :].todense()))
arr1 = np.squeeze(np.asarray(tf_idf_matrix[1, :].todense()))

print(f'The size of the tf_idf matrix for the texts = {tf_idf_matrix.get_shape()}')

# Compute cosine similarity based on cosine distance
cos_sim = 1 - spatial.distance.cosine(arr0, arr1)
print(f"The cosine similarity between {observation2[0]} and {observation2[1]}= {cos_sim}")

The size of the tf_idf matrix for the texts = (2, 51)
The cosine similarity between modi politics hate modiji loves india modiji want make new india corruption free terror free india hate not nature modiji yes modiji hates only enemies our country terrorists destroying terrorists camp hatedont defame our humane kind pure honest and knows once modi comes again his entire family will jail for corruption only way avoid jail need come back power will not hesitate even selling the country just come back power corruption his blood= 0.05834325199722945


#### 4. Method 3: **cosine_similarity** from sklearn library

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Create a TfidfVectorizer Object using default parameters: use_idf=True, smooth_idf=True, sublinear_tf=False
tfidf_vectorizer = TfidfVectorizer(use_idf=True,smooth_idf=True, sublinear_tf=False)

# the 100th and 10,000th tweets 
observation2 = [twitter_data['clean_text'][99], twitter_data['clean_text'][9999]]

# Fit to the corpus, then convert a collection of raw documents to a matrix of TF-IDF features.
tf_idf_matrix = tfidf_vectorizer.fit_transform(observation2)

print(f'The size of the tf_idf matrix for the texts = {tf_idf_matrix.get_shape()}')

# Compute the cosine similarity using the built in function in sklearn library
cos_sim = cosine_similarity(tf_idf_matrix,dense_output=True)
print(f"The cosine similarity betwee {observation2[0]} and {observation2[1]}= {cos_sim[0,1]}")

The size of the tf_idf matrix for the texts = (2, 51)
The cosine similarity betwee modi politics hate modiji loves india modiji want make new india corruption free terror free india hate not nature modiji yes modiji hates only enemies our country terrorists destroying terrorists camp hatedont defame our humane kind pure honest and knows once modi comes again his entire family will jail for corruption only way avoid jail need come back power will not hesitate even selling the country just come back power corruption his blood= 0.058343251997229464


#### 5. Find the cosine similarity in clean_text between the 100th and 10,000th tweets using the Spacy function.

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# Load the large model
nlp = spacy.load("en_core_web_lg")


# 100th and 10,000th tweets
text1 = twitter_data['clean_text'][99]
text2 = twitter_data['clean_text'][9999]
print("#"*80)
print(text1)
print("#"*80)
print("#"*80)
print(text2)
print("#"*80)
# Create nlp objects for the two docs
doc1 = nlp(text1)
doc2 = nlp(text2)
cos_sine = cosine_similarity(doc1.vector.reshape(1, -1),doc2.vector.reshape(1, -1),dense_output=True)
print(f"The similarity between {doc1} and {doc2} = {cos_sine[0][0]:.2f}")

################################################################################
modi politics hate modiji loves india modiji want make new india corruption free terror free india hate not nature modiji yes modiji hates only enemies our country terrorists destroying terrorists camp hatedont defame our humane kind pure honest
################################################################################
################################################################################
knows once modi comes again his entire family will jail for corruption only way avoid jail need come back power will not hesitate even selling the country just come back power corruption his blood
################################################################################
The similarity between modi politics hate modiji loves india modiji want make new india corruption free terror free india hate not nature modiji yes modiji hates only enemies our country terrorists destroying terrorists camp hatedont

Next, let's find the texts with the highest cosine similarity.

#### 6. Find the tweets with the cosine similarity > 60% with the 100th tweets using Spacy in this dataset.

In [6]:
import spacy

# Load the large model
nlp = spacy.load("en_core_web_lg")

# Take out the first two moview review
text1 = twitter_data['clean_text'][99]
print("#"*80)
print(text1)
print("#"*80)

# This is a big file
# process only 100 of them to demonstrate 
# the result
text = twitter_data['clean_text'][:100]


for text2 in text:
    doc1 = nlp(text1)
    doc2 = nlp(str(text2))
    
    if doc2.similarity(doc1) > 0.60:
        print("-"*80)
        print(text2)
        print("-"*80)
        
        # Compute the document similarity
        print(f"The similarity between them = {doc1.similarity(doc2):.2f}")


################################################################################
modi politics hate modiji loves india modiji want make new india corruption free terror free india hate not nature modiji yes modiji hates only enemies our country terrorists destroying terrorists camp hatedont defame our humane kind pure honest
################################################################################
--------------------------------------------------------------------------------
when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples
--------------------------------------------------------------------------------
The similarity between them = 0.80
--------------------------------------------------------------------------------
talk all the nonsense and continue all the drama will vote for modi 
----------------------------------

  if doc2.similarity(doc1) > 0.60:


--------------------------------------------------------------------------------
’ confused who said that intellectuals should decide modi’ policies the question was which intellectuals sided with modi and similarly there’ those who’ disagree with his policies too where’ the question deciding anything
--------------------------------------------------------------------------------
The similarity between them = 0.68
--------------------------------------------------------------------------------
asked learn from how treat minority well does want what did minor 
--------------------------------------------------------------------------------
The similarity between them = 0.75
--------------------------------------------------------------------------------
for new india can vote for shri narendra modi 
--------------------------------------------------------------------------------
The similarity between them = 0.69
-------------------------------------------------------------------------

#### 7. Compute the corpus vector that is equal to the average of all the document vectors, where each document corresponds to a tweet or a row in this dataset.

In [7]:
import numpy as np
import pandas as pd
import spacy

# Load the large model
nlp = spacy.load("en_core_web_lg")

length = len(pd.Series(twitter_data['clean_text']).to_string())

for i in range(length // 10000):
    myStr = pd.Series(twitter_data['clean_text'][i * 10000 : 10000 * (i + 1)]).to_string()

    doc = nlp(myStr)

    # Compute the average of all the word/token vector in the given doc/review
    avg_word_vec= np.array([token.vector for token in doc]).mean(axis=0)
    # Obtain the doc vector
    doc_vec = doc.vector
    # test if same shape, same elements values 
    # for each 10000 tweeters ten times
    if np.array_equal(avg_word_vec, doc_vec):  
        print(f" avg_word_vec==doc_vec ? {(avg_word_vec==doc_vec).all()}")
    else:
        print(f"They don't have the same shape at all!")

They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
They don't have the same shape at all!
 avg_word_vec==doc_vec ? True
 avg_word_vec==doc_vec ? True
 avg_word_vec==doc_vec ? True
 avg_word_vec==doc_vec ? True
 avg_word_vec==doc_vec ? True
 avg_word_vec==doc_vec ? True
 avg_word_vec==doc_vec ? True
 avg_word_vec==doc_vec ? True
 avg_word_vec==doc_vec ? True
 avg_word_vec==doc_vec ? True
 avg_word_vec==doc_vec ? True
 avg_word_vec==doc_vec ? True
 avg_word_vec==d