In [3]:
# General packages
import numpy as np
import pandas as pd
import os
import re
from functools import reduce

# Machine learning packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

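### Create a class that computes text similarities between a company's yearly clean files (Jaccard, cosine on term counts and on TF-IDF, Euclidean distance)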

In [10]:
class similarities():
    '''
    Class that performs the different similarities between clean files of a given company
    Cosine similarity on DTF / Cosine similarity on TF-IDF / Jaccard Similarity...
    '''
    def __init__(self, comp_code):
        self.comp_code = comp_code


    def shape_data(self):
        '''
        Returns the company's clean data in a dictionnary shape
        '''
        comp_clean_path = './data/{}/clean/'.format(self.comp_code)
        comp_data_dic = {}

        if not os.path.isdir(comp_clean_path):
            raise Exception("Company's clean files don't exist: check folder!")

        for f in os.listdir(comp_clean_path):
            # If the file is not a company file
            if f[0] == '.':
                continue
            
            # Extract the year
            year = int(f.split('_')[1])

            f_path = os.path.join(comp_clean_path, f)
            f_txt = open(f_path, "r")
            f_str = f_txt.read()

            # Add the string to the company's list
            comp_data_dic[year] = f_str

            # Close the file
            f_txt.close()

        return(comp_data_dic)

    
    def match_years(self, year):
        '''
        Function that takes as input the starting interesting year
        Returns a list of tuples with years to match: [(year_1, year_2), (year_2, year_3), ...]
        '''
        comp_clean_path = './data/{}/clean/'.format(self.comp_code)
        comp_data_dic = {}
        years = []

        for f in os.listdir(comp_clean_path):
            # If the file is not a company file 
            if f[0] == '.':
                continue
            # Extract the year
            year_doc = f.split('_')[1]
            years.append(int(year_doc))
        
        comp_years = sorted([y for y in years if y >= int(year)])
        
        # If there are no years to match
        if len(comp_years) == 0 or len(comp_years) == 1:
            return([])

        match_years = []
        for i in range(len(comp_years[:-1])):
            match_years.append((comp_years[i], comp_years[i+1]))
        
        return(match_years)



    def jaccard(self, year): 
        '''
        Simple implementation of the Jaccard similarity (intersection / union of two texts)
        '''
        # Extract the company's data from the folder
        comp_data_dic = self.shape_data()
        year = int(year)

#         if year not in list(comp_data_dic.keys()):
#             raise Exception('Year not in folder! Check folder or check year format (YY)')

        # Years to match
        match = self.match_years(year)
        
        if len(match) == 0:
            return(pd.DataFrame({'year_1':[], 'year_2':[], 'jaccard_sim':[]}))

        # Loop over the other texts and find the similarities
        jaccard_sim = []
        year1, year2 = [], []

        for y1, y2 in match:
            # Extract the text for both years
            text1, text2 = comp_data_dic[y1], comp_data_dic[y2]

            # List the unique words in a document
            words1 = set(text1.split()) 
            words2 = set(text2.split())
            
            # Find the intersection of words list of doc1 & doc2
            intersection = words1.intersection(words2)

            # Find the union of words list of doc1 & doc2
            union = words1.union(words2)
            
            # Add the Jaccard similarity to the list
            jaccard_sim.append(float(len(intersection)) / len(union))

            # Add years to the lists
            year1.append(y1)
            year2.append(y2)

        # Turn the results into a dataframe
        final_df = pd.DataFrame({'year_1': year1, 'year_2': year2, 'jaccard_sim': [np.round(jac, 3) for jac in jaccard_sim]})

        return(final_df)
    

    def dtf_cosine(self, year):
        '''
        Returns the dtf implementation of cosine similarity for the desired year with all the other years
        '''
        # Extract the company's data from the folder
        comp_data_dic = self.shape_data()
        year = int(year)

        # Function that creates the full document 
        union = lambda t1, t2: set(t1.split()).union(set(t2.split()))

        # Function that counts the number of occurences of words from text in the full document
        def DTF(doc, text): 
            res = dict.fromkeys(doc,0)
            for word in text.split():
                res[word]+=1 
            return res

#         if year not in list(comp_data_dic.keys()):
#             raise Exception('Year not in folder! Check folder or check year format (YY)')

        # Years to match
        match = self.match_years(year)
        
        if len(match) == 0:
            return(pd.DataFrame({'year_1':[], 'year_2':[], 'dtf_cosine_sim':[]}))

        # Loop over the other texts and find the cosine similarities
        cosine_sim = []
        year1, year2 = [], []

        for y1, y2 in match:
            # Extract the text for both years
            text1, text2 = comp_data_dic[y1], comp_data_dic[y2]

            full_text = union(text1, text2)
            D1_TF = DTF(full_text, text1)
            D2_TF = DTF(full_text, text2)
            df = pd.DataFrame([D1_TF,D2_TF])
            df1 = df.loc[0,:]
            df2 = df.loc[1,:]
            cosine_sim.append(np.dot(df1,df2)/(np.linalg.norm(df1)*np.linalg.norm(df2)))

            # Add the years
            year1.append(y1)
            year2.append(y2)

        # Turn the results into a dataframe
        final_df = pd.DataFrame({'year_1': year1, 'year_2': year2, 'dtf_cosine_sim': [np.round(cos, 2) for cos in cosine_sim]})

        return(final_df)



    def tfidf_cosine(self, year):
        '''
        Returns the tf-idf implementation of cosine similarity for the desired year with all the other years
        '''
        # Extract the company's data from the folder
        comp_data_dic = self.shape_data()
        year = int(year)

        # Get the matching years
        match = self.match_years(year)
        
        if len(match) == 0:
            return(pd.DataFrame({'year_1':[], 'year_2':[], 'tfidf_cosine_sim':[]}))
            
        
        match_u = list(set([y1 for y in match for y1 in y]))

        # Apply the tf-idf vectorization
        tfidf_data = TfidfVectorizer().fit_transform([comp_data_dic[y] for y in match_u])

        # Put this data in a dictionnary
        comp_data_tfidf_dic = {year_doc: tfidf_data[i] for i, year_doc in enumerate(match_u)}
                
#         if year not in list(comp_data_dic.keys()):
#             raise Exception('Year not in folder! Check folder or check year format (YY)')

        cosine_sim = []
        year1, year2 = [], []

        for y1, y2 in match:
            # Extract the text for both years
            text1, text2 = comp_data_tfidf_dic[y1], comp_data_tfidf_dic[y2]

            # Perform the cosine sim
            cosine = linear_kernel(text1, text2).flatten()[0]

            # Add everything to the results
            cosine_sim.append(cosine)
            year1.append(y1)
            year2.append(y2)

        # Turn the results into a DataFrame
        final_df = pd.DataFrame({'year_1': year1, 'year_2': year2, 'tfidf_cosine_sim': [np.round(cos, 2) for cos in cosine_sim]})

        return(final_df)
            

    def euclidian(self, year):
        '''
        Returns the euclidian distance with tf-idf vectorization for the desired year with all the other years
        '''
        # Extract the company's data from the folder
        comp_data_dic = self.shape_data()
        year = int(year)

        # Get the matching years
        match = self.match_years(year)
        
        if len(match) == 0:
            return(pd.DataFrame({'year_1':[], 'year_2':[], 'euclidian_dist':[]}))
        
        match_u = list(set([y1 for y in match for y1 in y]))

        # Apply the tf-idf vectorization
        tfidf_data = TfidfVectorizer().fit_transform([comp_data_dic[y] for y in match_u])

        # Put this data in a dictionnary
        comp_data_tfidf_dic = {year_doc: tfidf_data[i] for i, year_doc in enumerate(match_u)}
                
#         if year not in list(comp_data_dic.keys()):
#             raise Exception('Year not in folder! Check folder or check year format (YY)')

        euc_dist = []
        year1, year2 = [], []

        for y1, y2 in match:
            # Extract the text for both years
            text1, text2 = comp_data_tfidf_dic[y1], comp_data_tfidf_dic[y2]

            # Perform the cosine sim
            euc = euclidean_distances(text1, text2).flatten()[0]

            # Add everything to the results
            euc_dist.append(euc)
            year1.append(y1)
            year2.append(y2)


        # Turn the results into a DataFrame
        final_df = pd.DataFrame({'year_1': year1, 'year_2': year2, 'euclidean_dist': [np.round(e, 2) for e in euc_dist]})

        return(final_df)


    def compute_sim(self, year, *sims):
        '''
        Function that takes as input similarities / distances to compute
        Return a DataFrame with the results
        '''
        sim_match = {
            'jaccard': lambda y: self.jaccard(y),
            'dtf': lambda y: self.dtf_cosine(y),
            'tfidf': lambda y: self.tfidf_cosine(y),
            'euclidian': lambda y: self.euclidian(y)
        }

        df_list = []
        for sim in sims:
            if sim not in list(sim_match.keys()):
                raise Exception('Check the similarities you ask for! Can only be in this list: [jaccard, dtf, tfidf, euclidian]')

            df_list.append(sim_match[sim](year))

        # Merge all the dfs together
        final_df = reduce(lambda x, y: pd.merge(x, y, on = ['year_1', 'year_2']), df_list)

        # Order the final df
        final_df.year_1 = final_df.year_1.transform(int)
        final_df.year_2 = final_df.year_2.transform(int)

        return(final_df.sort_values(by = 'year_2'))


In [None]:
# Create the Apple object
# apple_sim = similarities('AAPL')
# apple_sim.shape_data().keys()

dict_keys([18, 19, 11, 13, 12, 15, 17, 16, 21, 20])

In [None]:
# Get the desired distances / similarities in a dataframe
# apple_df = apple_sim.compute_sim(12, 'jaccard', 'dtf', 'tfidf', 'euclidian')