In [13]:
#===========================================================================================================================
#                                               IDS 566 - HOMEWORK 1
#===========================================================================================================================

In [1]:
#===========================================================================================================================
#                                       Importing all the required libraries
#===========================================================================================================================

#pip install nltk       #(UNCOMMENT - For First Run Only)

import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
import string

In [2]:
#===========================================================================================================================
#                                           Reading all the input text files 
#===========================================================================================================================

file1 = open("UIC.txt", "r", encoding="utf8")
uic = file1.read()
file1.close()

file2 = open("UIUC.txt", "r", encoding="utf8")
uiuc = file2.read()
file2.close()

file3 = open("UIS.txt", "r", encoding="utf8")
uis = file3.read()
file3.close()

file4 = open("MIT.txt", "r", encoding="utf8")
mit = file4.read()
file4.close()

file5 = open("Stanford.txt", "r", encoding="utf8")
stanford = file5.read()
file5.close()

file6 = open("Tesla.txt", "r", encoding="utf8")
tesla = file6.read()
file6.close()


In [3]:
#===========================================================================================================================
#                                   Begin of Data Cleaning Process 
#===========================================================================================================================


#1. Removing Punctuations

uic      = uic.translate(str.maketrans('', '', string.punctuation))
uiuc     = uiuc.translate(str.maketrans('', '', string.punctuation))
uis      = uis.translate(str.maketrans('', '', string.punctuation))
mit      = mit.translate(str.maketrans('', '', string.punctuation))
stanford = stanford.translate(str.maketrans('', '', string.punctuation))
tesla    = tesla.translate(str.maketrans('', '', string.punctuation))

#2. Removing Apostrophe
uic      = uic.replace("'", "")
uiuc     = uiuc.replace("'", "")
uis      = uis.replace("'", "")
mit      = mit.replace("'", "")
stanford = stanford.replace("'", "")
tesla    = tesla.replace("'", "")

#3. Converting all words to Lowercase

uic      = uic.lower()
uiuc     = uiuc.lower()
uis      = uis.lower()
mit      = mit.lower()
stanford = stanford.lower()
tesla    = tesla.lower()

#4. Tokenization

wstk = WhitespaceTokenizer() 

uic      = wstk.tokenize(uic)
uiuc     = wstk.tokenize(uiuc)
uis      = wstk.tokenize(uis)
mit      = wstk.tokenize(mit)
stanford = wstk.tokenize(stanford)
tesla    = wstk.tokenize(tesla)

#5. Removing Stop Words

stop_words = stopwords.words('english')

uic      = [w for w in uic if not w in stop_words]
uiuc     = [w for w in uiuc if not w in stop_words]
uis      = [w for w in uis if not w in stop_words]
mit      = [w for w in mit if not w in stop_words]
stanford = [w for w in stanford if not w in stop_words]
tesla    = [w for w in tesla if not w in stop_words]

#6. Stemming words

porter = PorterStemmer()

uic      = [porter.stem(word) for word in uic]
uiuc     = [porter.stem(word) for word in uiuc]
uis      = [porter.stem(word) for word in uis]
mit      = [porter.stem(word) for word in mit]
stanford = [porter.stem(word) for word in stanford]
tesla    = [porter.stem(word) for word in tesla]

#===========================================================================================================================
#                                      End of Data Cleaning Process 
#===========================================================================================================================

In [4]:
#===========================================================================================================================
#                                    Calculating Jaccard Similarity between the 6 input documents 
#===========================================================================================================================

def jaccard_similarity(query, document):
    intersect_l = list(set(query) & set(document))
    union_l = list(set(query) or set(document))
    
    return len(intersect_l)/len(union_l)

Jac_sim_mit_uic=jaccard_similarity(mit, uic)
print("Jaccard similarity between MIT and UIC is "+ str(Jac_sim_mit_uic))
Jac_sim_Stan_uic=jaccard_similarity(stanford, uic)
print("Jaccard similarity between Stanford and UIC is "+ str(Jac_sim_Stan_uic))
Jac_sim_Tes_uic=jaccard_similarity(tesla, uic)
print("Jaccard similarity between Tesla and UIC is "+ str(Jac_sim_Tes_uic))
Jac_sim_uis_uic=jaccard_similarity(uis, uic)
print("Jaccard similarity between UIS and UIC is "+ str(Jac_sim_uis_uic))
Jac_sim_Tes_uiuc=jaccard_similarity(uiuc, uic)
print("Jaccard similarity between UIUC and UIC is "+ str(Jac_sim_Tes_uiuc))


Jaccard similarity between MIT and UIC is 0.18972999035679847
Jaccard similarity between Stanford and UIC is 0.263913195659783
Jaccard similarity between Tesla and UIC is 0.18243243243243243
Jaccard similarity between UIS and UIC is 0.5672131147540984
Jaccard similarity between UIUC and UIC is 0.3111979166666667


In [5]:
#===========================================================================================================================
#                       Calculating Cosine Similarity based on only Term Frequency(TF)
#===========================================================================================================================
# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

#Unlist the cleaned data obtained above for calculating cosine similarity
uic      = ','.join(str(v) for v in uic)
uiuc     = ','.join(str(v) for v in uiuc)
uis      = ','.join(str(v) for v in uis)
mit      = ','.join(str(v) for v in mit)
stanford = ','.join(str(v) for v in stanford)
tesla    = ','.join(str(v) for v in tesla)

#Merge all the documents into one document
doc = [uic,uiuc,uis,mit,stanford,tesla]

# Create the Document Term Matrix
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix    = count_vectorizer.fit_transform(doc)

# Convert Sparse Matrix TF(Term Frequency) to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrix  = sparse_matrix.todense()
tf = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=['uic', 'uiuc','uis','mit','stanford','tesla'])


# Computing Cosine Similarity based on tf
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(tf, tf))

#Below is the Cosine Similarity Matrix based on only Term frequency-

[[1.         0.76679328 0.60162564 0.35858466 0.4763952  0.22582247]
 [0.76679328 1.         0.72802877 0.40390357 0.54561252 0.26283644]
 [0.60162564 0.72802877 1.         0.25467702 0.38466961 0.12680528]
 [0.35858466 0.40390357 0.25467702 1.         0.32399016 0.19735668]
 [0.4763952  0.54561252 0.38466961 0.32399016 1.         0.24465725]
 [0.22582247 0.26283644 0.12680528 0.19735668 0.24465725 1.        ]]


In [6]:
#===========================================================================================================================
#                             Calculating Cosine Similarity based on TF-IDF
#===========================================================================================================================

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


vectorizer = TfidfVectorizer()

#The below variable tfidf contains the TF-IDF values for each term of these documents
tfidf = vectorizer.fit_transform(doc)
words = vectorizer.get_feature_names()
similarity_matrix = cosine_similarity(tfidf)

print(similarity_matrix)

#Below is the Cosine Similarity Matrix based on TF-IDF-

[[1.         0.64338228 0.46427893 0.2393719  0.29727417 0.12533567]
 [0.64338228 1.         0.59480755 0.28967765 0.36147868 0.15902449]
 [0.46427893 0.59480755 1.         0.16042576 0.23253759 0.06401812]
 [0.2393719  0.28967765 0.16042576 1.         0.18176578 0.09928483]
 [0.29727417 0.36147868 0.23253759 0.18176578 1.         0.11260325]
 [0.12533567 0.15902449 0.06401812 0.09928483 0.11260325 1.        ]]
