<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Read-in-Data" data-toc-modified-id="Read-in-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Read in Data</a></span><ul class="toc-item"><li><span><a href="#Read-in-excel-files-combining-ticker-symbols-with-the-IQID" data-toc-modified-id="Read-in-excel-files-combining-ticker-symbols-with-the-IQID-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Read in excel files combining ticker symbols with the IQID</a></span></li><li><span><a href="#Read-in-independent-variables,-join-tickers" data-toc-modified-id="Read-in-independent-variables,-join-tickers-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Read in independent variables, join tickers</a></span></li><li><span><a href="#Join-in-the-credit-rating-data" data-toc-modified-id="Join-in-the-credit-rating-data-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Join in the credit rating data</a></span></li></ul></li></ul></div>

In [1]:
import pandas as pd 
import numpy as np 
import plotly.graph_objects as go 
from tqdm import tqdm 

## Read in Data

---
### Read in excel files combining ticker symbols with the IQID 

In [2]:
ids= pd.DataFrame() 
## Read in ticker symbols
for i in range(1, 6): 
    df = pd.read_excel('capiq_data/in_process_ids/ids {}.xlsx'.format(i),
                       engine='openpyxl')[['ID', 'IQID', 'IQ Name']]
    ids = pd.concat([ids, df]) 

## See if there are any duplicates 
print(ids.duplicated().sum()) 
## See if there are any nulls 
print(ids.isna().sum()) 
ids.head() 

0
ID         0
IQID       0
IQ Name    0
dtype: int64


Unnamed: 0,ID,IQID,IQ Name
0,MMM,IQ289194,3M Company
1,ABT,IQ247483,Abbott Laboratories
2,ABBV,IQ141885706,AbbVie Inc.
3,ABMD,IQ247589,"Abiomed, Inc."
4,ACN,IQ972190,Accenture plc


### Read in independent variables, join tickers 

In [3]:
## Join IQID for inds so we can see the Ticker and the name 

ind_df = pd.read_csv('small_df.csv') 

ind_df = ind_df.merge(ids, on = ['IQID', 'IQ Name']) 
ind_df.drop(['Unnamed: 0', 'IQ Name', 'IQID', 
            'quarter'], axis = 1, inplace = True) 

## The data is quarterly, but we need to lok at it annually. 
## This means we have to take the mean of the data for all the quarters 
ind_df = ind_df.groupby(['year', 'ID']).mean().reset_index()
                      
ind_df.head() 

Unnamed: 0,year,ID,IQ_TOTAL_REV,IQ_NI_CF,IQ_AR,IQ_GP,IQ_TOTAL_ASSETS,IQ_AP,IQ_TOTAL_LIAB,IQ_TOTAL_DEBT,IQ_CASH_FINAN,IQ_TOTAL_EQUITY,IQ_CASH_EQUIV
0,2000,PYPL,4.262073,-44.73064,0.0,-11.11638,77.265667,2.933,34.536333,0.076667,25.746083,42.729333,36.093333
1,2001,PYPL,26.07825,-26.95056,0.0,2.38625,198.586997,5.876978,126.882032,1.314882,3.11563,103.751965,101.747015
2,2002,PYPL,51.2665,0.88,0.0,16.2815,265.3055,10.945,107.277,5.615,34.5565,158.0285,89.8325
3,2006,HLT,2219.5,162.0,705.0,667.0,16737.0,1112.0,13195.5,7901.0,-771.5,3541.5,126.0
4,2007,HLT,1974.5,130.0,690.0,593.5,16127.5,1745.0,12126.5,6869.5,-666.0,4001.0,149.0


### Join in the credit rating data

We want an inner join so we only keep the companies that we have the credit rating for

In [4]:
## Join in the credit Ratings Data
credit_ratings = pd.read_csv('credit.csv')[['Year', 'TickerSymbol',
                                            'DomesticLTICRSPMthlyAvg']]
credit_ratings.rename({'DomesticLTICRSPMthlyAvg': 'rating'}, inplace = True, axis = 1)  

## Add credit ratings to df  
tot_df = credit_ratings.merge(ind_df, how = 'inner', 
                             left_on = ['Year', 'TickerSymbol'], 
                             right_on = ['year', 'ID'])
tot_df.drop(['Year', 'TickerSymbol'], axis = 1, inplace = True) 

tot_df.head() 

Unnamed: 0,rating,year,ID,IQ_TOTAL_REV,IQ_NI_CF,IQ_AR,IQ_GP,IQ_TOTAL_ASSETS,IQ_AP,IQ_TOTAL_LIAB,IQ_TOTAL_DEBT,IQ_CASH_FINAN,IQ_TOTAL_EQUITY,IQ_CASH_EQUIV
0,18.0,2010,AAL,5586.0,-98.0,738.0,1286.0,25088.0,1156.0,29033.0,11136.0,349.0,-3945.0,168.0
1,19.5,2011,AAL,5994.75,-494.75,923.5,1181.0,25366.75,1301.0,30467.25,11580.5,181.75,-5100.5,297.75
2,27.0,2012,AAL,6213.75,-469.0,1117.25,1374.25,24168.5,1697.75,32422.5,10375.0,120.75,-8254.0,433.5
3,26.166667,2013,AAL,6678.0,-458.5,1379.75,1729.5,29781.5,1703.25,36592.75,13172.5,949.75,-6811.25,766.75
4,16.833333,2014,AAL,10662.25,720.5,1930.25,3009.5,43986.5,1543.5,40943.25,17203.0,-78.75,3043.25,1160.25


In [5]:
tot_df_clean = pd.DataFrame() 



for ticker in tqdm(tot_df['ID'].unique()): 
    
    
    ## small df is all the rows with the ticker, sort by year 
    small_df = tot_df[tot_df['ID'] == ticker].sort_values(by = 'year',
                                                             ascending = True)
    
    
    ## Insert a lead rating column. This is the predictor column, 
    ## as we are trying to predict the credit rating for the next year
    small_df.insert(loc = 0, column = 'lead_rating', 
               value = small_df.rating.shift(1)) 
    
    
    ## Set the index as the year and the ticker. 
    small_df.set_index(['year', 'ID'], inplace = True) 
    
    ## Take the difference between rows. We are looking to find 
    ## differences in credit rating, so we are going to compare it to 
    ## differences in dependent variables. 
    ## We can then drop the nulls. 
    small_df = small_df.diff().dropna()  
    
    tot_df_clean = pd.concat([tot_df_clean, small_df], axis = 0) 

tot_df_clean.isna().sum()  

100%|██████████| 306/306 [00:01<00:00, 215.26it/s]


lead_rating        0
rating             0
IQ_TOTAL_REV       0
IQ_NI_CF           0
IQ_AR              0
IQ_GP              0
IQ_TOTAL_ASSETS    0
IQ_AP              0
IQ_TOTAL_LIAB      0
IQ_TOTAL_DEBT      0
IQ_CASH_FINAN      0
IQ_TOTAL_EQUITY    0
IQ_CASH_EQUIV      0
dtype: int64

In [6]:
from sklearn.model_selection import train_test_split 

train_x, test_x, train_y, test_y = train_test_split(
    tot_df_clean.drop(['lead_rating'], axis = 1) ,
    tot_df_clean['lead_rating']
)



In [7]:
from sklearn.model_selection import RepeatedStratifiedKFold 
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

x = tot_df_clean.drop(['lead_rating'], axis = 1) 
y = tot_df_clean['lead_rating'] 

params = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

## define evaluation
cv = RepeatedStratifiedKFold(n_splits = 10, 
                            n_repeats = 3)


search = GridSearchCV(estimator = SVR(), 
                      param_grid = params, verbose = 1)


In [None]:
search.fit(train_x, train_y)   

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [None]:
prediction = search.predict(test_x)
from sklearn.metrics import accuracy_score 
score = accuracy_score(prediction, test_y) 
print(score) 