# Final analysis of large cap companies and first results

In [6]:
# Install the third-party packages this notebook relies on.
# Each pip output is piped through `grep -v 'already satisfied'`, which drops
# the lines containing 'already satisfied' so only new installs are shown.
# NOTE(review): no versions are pinned here — consider pinning for reproducibility.
%pip install -U sec-edgar-downloader | grep -v 'already satisfied' 
%pip install yfinance | grep -v 'already satisfied'
%pip install pandas_datareader | grep -v 'already satisfied' 
%pip install cleantext | grep -v 'already satisfied'
%pip install ipynb | grep -v 'already satisfied'

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import ipynb
import pickle
import os

# Execute the sibling notebooks in this kernel so their definitions are available below:
#   A_data_extraction.ipynb     -> files_downloader (used in comparisons)
#   I_data_pre_processing.ipynb -> files_cleaning   (used in comparisons)
#   II_models.ipynb             -> similarities     (used in comparisons)
%run A_data_extraction.ipynb
%run I_data_pre_processing.ipynb
%run II_models.ipynb

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [7]:
def comparisons(tkrs, n, plot = False):
    '''
    Run the download -> clean -> similarity pipeline for a list of tickers
    and save the resulting scores as pickle files.

    For every ticker, the scores returned by `compute_sim` are written to
    ./data/<ticker>/<ticker>_similarities.pickle. Once all tickers have been
    processed, a dictionary mapping each ticker to its scores is written to
    ./data/similarities.pickle. Missing target folders are created.

    Parameters:
        tkrs: iterable of company tickers to process
        n:    number of years to perform the comparison on
              (forwarded to files_downloader as `n`)
        plot: if True, show a kernel density plot of the 'tfidf_cosine_sim'
              scores of each company right after they are computed

    Returns:
        None. Results are only persisted to disk.
    '''
    sim_dic = {}
    for tick in tkrs:
        print('Working on {}'.format(tick))
        # Download raw files and create folders for each company (A_data_extraction.ipynb)
        files_downloader("10-K", str(tick), n = n)

        # Clean the previously downloaded files and store them in the clean folder (I_data_pre_processing.ipynb)
        # Create the company object
        comp_files = files_cleaning(tick)
        # Create the clean files
        comp_files.write_clean_files()

        # Compute similarities (II_models.ipynb)
        print('Computing similarities...')
        comp_sim = similarities(tick)
        comp_df = comp_sim.compute_sim('01', 'tfidf')
        sim_dic[tick] = comp_df

        if plot:
            # Plot the computed scores (comp_df), not the `similarities` helper
            # object (comp_sim), which is not a dataset seaborn can read.
            # NOTE(review): assumes compute_sim returns a table with a
            # 'tfidf_cosine_sim' column — confirm against II_models.ipynb.
            sns.kdeplot(data = comp_df, x = 'tfidf_cosine_sim')
            plt.show()

        # Save the company's similarities
        save_path = './data/{}/{}_similarities.pickle'.format(tick, tick)
        # Make sure the company folder exists so the write cannot fail on a missing directory
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, 'wb') as handle:
            pickle.dump(comp_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

        print('Done with {}!'.format(tick))
        print('--------------------------')

    # Save the similarities of all companies as a single pickle file
    save_path = './data/similarities.pickle'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, 'wb') as handle:
        pickle.dump(sim_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

In [8]:
def clean_all():
    '''
    Empty the 'raw' and 'clean' folders of every entry found in ./data
    so that the next step can be rerun from scratch.

    Only the direct content of those folders is removed; names starting
    with a dot are kept, and the folders themselves are left in place.
    '''
    data_root = './data'
    for entry in os.listdir(data_root):
        # 'raw' is emptied before 'clean' for each entry
        for stage in ('raw', 'clean'):
            stage_dir = os.path.join(data_root, entry, stage)
            if not os.path.exists(stage_dir):
                continue
            visible = [name for name in os.listdir(stage_dir) if not name.startswith('.')]
            for name in visible:
                os.remove(os.path.join(stage_dir, name))

In [9]:
# Start from a clean slate: delete the non-hidden files in the 'raw' and 'clean'
# folders of every entry under ./data before rerunning the pipeline.
clean_all()

In [10]:
# Load the semicolon-separated ticker file, keep only its 'ticker' column,
# and expose the column values as a plain Python list.
tickers_df = pd.read_csv('./data/stock_tickers.csv', sep=';').loc[:, ['ticker']]
tickers = list(tickers_df['ticker'].values)

In [11]:
# Run the full pipeline on tickers[1:60] (list positions 1 to 59) with n = 20.
# NOTE(review): the slice starts at 1, so the first ticker of the CSV is skipped —
# confirm this is intentional.
comparisons(tickers[1:60], n = 20)

Working on MSFT
Extracting textual content...
Cleaning the text (might take long)...
Writing the clean files...
Computing similarities...
Done with MSFT!
--------------------------
Working on GOOG
Extracting textual content...
Cleaning the text (might take long)...
Writing the clean files...
Computing similarities...
Done with GOOG!
--------------------------
Working on TSLA
Extracting textual content...
Cleaning the text (might take long)...
Writing the clean files...
Computing similarities...
Done with TSLA!
--------------------------
Working on UNH
Extracting textual content...
Cleaning the text (might take long)...
Writing the clean files...
Computing similarities...
Done with UNH!
--------------------------
Working on JNJ
Extracting textual content...
Cleaning the text (might take long)...
Writing the clean files...
Computing similarities...
Done with JNJ!
--------------------------
Working on FB
Extracting textual content...
Cleaning the text (might take long)...
Writing the clea