# Similarity Score Project 

Use the pre-trained Word2Vec model and your Webhose dataset to identify the 100 titles most similar to one chosen title.

###  Google Word2Vec Model 

### Load the downloaded pre-trained Google Word2Vec model from your computer

In [1]:
#!pip install gensim --user

In [1]:
import gensim, operator
from scipy import spatial
import numpy as np
from gensim.models import KeyedVectors

# Directory that holds the pre-trained word-vector files; load_wordvec_model
# prefixes it to the file name it is given. Machine-specific absolute path --
# adjust for your environment.
model_path = 'C:/Users/tramh/github/data'

In [2]:
def load_wordvec_model(modelName, modelFile, flagBin):
    """Load a word-vector model stored in word2vec format.

    Args:
        modelName: Human-readable label, used only in the progress messages.
        modelFile: File name of the model, resolved inside ``model_path``.
        flagBin: Forwarded as ``binary=`` to the loader; True for binary
            word2vec files, False for the text format.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    import os  # local import: the notebook's import cell does not bring in os

    # Join directory and file name with a proper separator. Plain string
    # concatenation drops the separator whenever model_path has no trailing
    # slash. Leading separators on modelFile are stripped so that a value
    # such as "/model.bin" is still resolved inside model_path.
    full_path = os.path.join(model_path, modelFile.lstrip('/\\'))

    print('Loading ' + modelName + ' model...')
    model = KeyedVectors.load_word2vec_format(full_path, binary=flagBin)
    print('Finished loading ' + modelName + ' model...')
    return model

In [3]:
# Load the Google News vectors; the final True is forwarded as `binary=` to the loader.
model_word2vec = load_wordvec_model('Word2Vec', 'GoogleNews-vectors-negative300.bin.gz', True)

Loading Word2Vec model...
Finished loading Word2Vec model...


### Import webhose data

In [4]:
# Reads the newsfeed file into a list of raw text lines (one JSON object per
# line); the lines are parsed into dictionaries in the following cell.
import json
# The context manager closes the file handle as soon as the lines are read.
with open("C:/data/webhose_apple.json") as feed_file:
    json_data = feed_file.readlines()

In [5]:
# Parse every raw line into a dictionary, then display how many newsfeeds
# (JSON objects) the collection contains
newsfeeds_read = [json.loads(raw_line) for raw_line in json_data]
len(newsfeeds_read)

10800

In [6]:
import random

# Collect the title of every newsfeed.
title_list = [feed['title'] for feed in newsfeeds_read]

# Pick the query title with a dedicated, seeded generator so the choice is
# repeatable across Restart & Run All and the global `random` state is left
# untouched. Change TITLE_SEED to query a different title.
TITLE_SEED = 42
article_title = random.Random(TITLE_SEED).choice(title_list)

### Checking Similarity

In [7]:
def vec_similarity(input1, input2, vectors):
    """Cosine similarity between two strings using summed word vectors.

    Each string is split on whitespace; the vectors of the words found in
    ``vectors`` are summed, and the two sums are compared with the cosine
    measure. Words missing from ``vectors`` are ignored.

    Args:
        input1: First string.
        input2: Second string.
        vectors: Object supporting ``vectors[word]`` that raises ``KeyError``
            for unknown words. If it exposes ``vector_size`` that value is
            used as the dimensionality; otherwise 300 is assumed.

    Returns:
        The cosine similarity as a float, or 0.0 when either summed vector
        is all zeros (for example, when a string has no known word).
    """
    dim = getattr(vectors, 'vector_size', 300)

    term_vectors = []
    for text in (input1, input2):
        total = np.zeros(dim)
        for token in text.split():
            try:
                total += vectors[token]
            except KeyError:
                # Out-of-vocabulary word: contributes nothing to the sum.
                continue
        term_vectors.append(total)

    # A zero vector has no direction, so the cosine is undefined. Handle it
    # explicitly instead of relying on what the distance routine returns for
    # that input.
    if not np.any(term_vectors[0]) or not np.any(term_vectors[1]):
        return 0.0

    result = 1 - spatial.distance.cosine(term_vectors[0], term_vectors[1])
    # NaN must be tested with isnan; comparing against the string 'nan'
    # never matches a float.
    if np.isnan(result):
        return 0.0

    return float(result)

In [8]:
# function checks whether the input words are present in the vocabulary for the model
def vocab_check(vectors, words):
    """Return the words that are present in the model's vocabulary.

    Args:
        vectors: Word-vector model exposing its vocabulary as either
            ``key_to_index`` (preferred when present) or ``vocab``.
        words: Iterable of candidate words.

    Returns:
        A list of the whitespace-stripped words found in the vocabulary,
        in their original order.
    """
    # Prefer `key_to_index`; fall back to `vocab` for models that only
    # provide the older attribute.
    vocabulary = getattr(vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = vectors.vocab

    output = []
    for word in words:
        # Strip before the lookup so the membership test and the stored
        # value refer to the same string.
        word = word.strip()
        if word and word in vocabulary:
            output.append(word)

    return output

In [9]:
# function calculates similarity between two strings using a particular word vector model
def calc_similarity(input1, input2, vectors):
    """Similarity of two strings according to a word-vector model.

    Each string is split on whitespace, reduced to the set of its words that
    ``vocab_check`` reports as known to ``vectors``, and the two sets are
    passed to ``vectors.n_similarity``.

    Args:
        input1: First string.
        input2: Second string.
        vectors: Word-vector model providing ``n_similarity``.

    Returns:
        Whatever ``vectors.n_similarity`` returns for the two word sets.
    """
    known_terms = [
        set(vocab_check(vectors, text.split()))
        for text in (input1, input2)
    ]
    return vectors.n_similarity(known_terms[0], known_terms[1])

In [10]:
# Score every title against the chosen article title.
sim_list = []

for candidate_title in title_list:
    # Titles that are not strings (e.g. a missing/null title) cannot be
    # tokenised; score them 0 rather than aborting the whole run.
    if not isinstance(candidate_title, str):
        sim_list.append(0.0)
        continue
    try:
        sim = calc_similarity(article_title, candidate_title, model_word2vec)
    except ZeroDivisionError:
        # Presumably raised when a title has no in-vocabulary words, leaving
        # nothing to average -- verify against the gensim version in use.
        # Only this case is swallowed; any other error now surfaces instead
        # of silently turning every score into 0.
        sim = 0.0
    sim_list.append(sim)

### The 100 most similar titles, in descending order of similarity score

In [11]:
import pandas as pd
# Pair each title with its score, rank by score (highest first) and show the top 100
title_scores = list(zip(title_list, sim_list))
df = pd.DataFrame(title_scores, columns=['Title', 'Similarity'])
most_similar = df.sort_values(by='Similarity', ascending=False)
most_similar.head(100)

Unnamed: 0,Title,Similarity
5255,"New iMac, iPad Pro and 16 inch MacBook Pro com...",1.000000
6077,"Rumor: iPad Pro, 16-inch MacBook Pro, iMac wit...",0.855856
6851,New 13 inch MacBook Pro gets unboxed again (Vi...,0.855090
159,"Apple’s 16-inch MacBook Pro, 2018’s 12.9-inch ...",0.815411
137,"Deals: iPads back for $249, Apple Watch 5 $299...",0.802628
2033,"May 2020 in review: 'iPhone 12' leaked, MacBoo...",0.788718
1266,"Latest 13-inch MacBook Pro hits new low, 10.2-...",0.783056
7286,Apple iPad Pro (2020) review: Is it a notebook...,0.779337
10665,"MacBook Pro $300 off, Apple Watch $179, iMac P...",0.778373
2147,May 2020 in Review: at last the MacBook Pro 13...,0.776781
