In [163]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 17})
import folium
import re
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostRegressor
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

class DataFrame(object):

    def __init__(self,file_path):
        """Create df object
        Parameters
        ----------
        file_path: takes in a file path to raw data
        """
        self.file_path = file_path
        
    def clean(self):
        '''
        Parameters
            self: self
        Returns
            a cleaned df
        '''
        df = pd.read_csv(self.file_path,encoding='latin1')
        df.dropna(inplace=True)
        df.drop(['permalink','region','founded_month','founded_quarter'],axis=1,inplace=True)
        df['founded_at'] = pd.to_datetime(df['founded_at'],errors='coerce')
        df['first_funding_at']= pd.to_datetime(df['first_funding_at'],errors='coerce')
        df['last_funding_at']= pd.to_datetime(df['first_funding_at'],errors='coerce')
        df['founded_year'] = df['founded_year'].astype('int64')
        df.drop(df[df['country_code']=='CAN'].index,inplace=True)
        df['funding_total_usd'] = df[' funding_total_usd '].apply(lambda x: x.replace(' ',''))\
            .apply(lambda x: x.replace(',',''))
        df['funding_total_usd'] = df['funding_total_usd'].apply(lambda x: x.replace('-','0'))
        df['funding_total_usd'] = df['funding_total_usd'].astype('int64')
        df['market'] = df[' market '].apply(lambda x: x.replace(' ','')) 
        df.drop([' market ',' funding_total_usd ','country_code','homepage_url','name','city','last_funding_at', 'round_A', 'round_B',
       'round_C', 'round_D', 'round_E', 'round_F', 'round_G', 'round_H','category_list'],axis=1,inplace=True)
        return df

def feature_engineer(df):
    '''
    Parameters
        df: Takes in a pandas data frame
    Returns
        a data frame with engineered features
    '''
    df['time_to_funding'] = abs((df['first_funding_at']-df['founded_at']).dt.days)
    test_list = list(df['market'].value_counts()\
        .rename_axis('market').reset_index(name='counts')[:20]['market'])
    df.loc[~df["market"].isin(test_list), "market"] = "Other"
    df.dropna(inplace=True)
    return df

def add_finance(symbol,df):
    '''
    Parameters
        symbol: stock ticker
        df: data frame to add 
    Returns
        a data frame with engineered features
    '''
    finance = yf.Ticker(symbol)
    fin_df = finance.history(period="max")
    fin_df['avg']=(fin_df['Open']+fin_df['Close'])/2
    fin_df.drop(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits'],axis=1,inplace=True)
    fin_df['first_funding_at'] = fin_df.index
    
    no_finance_data = list(clean_feat_df[clean_feat_df['first_funding_at']<fin_df['first_funding_at'].min()].index)
    df.drop(no_finance_data,inplace=True)
    df.set_index('first_funding_at')
    df = df.merge(fin_df,on='first_funding_at')
    return df

In [337]:
    intial_df = DataFrame('../../../Downloads/investments_VC.csv').clean()
    clean_feat_df=feature_engineer(intial_df)
    clean_feat_df=add_finance('^VIX',clean_feat_df)
    
    #change funding from to dollars to ones and zeros
    funding_type_df = clean_feat_df.loc[:,'seed':'product_crowdfunding']\
        .apply(lambda x: x>0).astype('int64')
    funding_type_df.drop('undisclosed',axis=1,inplace=True)

    #make dummies
    col_list = ['Advertising', 'Analytics', 'Biotechnology', 'CleanTechnology',
       'Consulting', 'CuratedWeb', 'E-Commerce', 'Education',
       'EnterpriseSoftware', 'Finance', 'Games', 'Hardware+Software',
       'HealthCare', 'HealthandWellness', 'Manufacturing', 'Mobile',
       'Security', 'Semiconductors', 'SocialMedia', 'Software','Other']
    market_dummies = pd.get_dummies(clean_feat_df['market']).reindex(columns=col_list)
    state_dummies = pd.get_dummies(clean_feat_df['state_code'])

    #set targets as ones and zeros
    clean_feat_df['status'] = clean_feat_df['status'].apply(lambda x: x.replace('operating','0'))\
        .apply(lambda x: x.replace('acquired','1')).apply(lambda x: x.replace('closed','0'))
    clean_feat_df['status'] = clean_feat_df['status'].astype('int64')

In [355]:
    #set X and Y, test train split and SMOTE                           
    X =market_dummies.iloc[:,:20].join(state_dummies.iloc[:,:50])\
        .join(clean_feat_df['time_to_funding']).join(clean_feat_df['avg']).join(funding_type_df).values
    y=clean_feat_df['status'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
    oversample = SMOTE()
    X_train, y_train = oversample.fit_resample(X_train, y_train)

    #build logisitic model
    log_model = LogisticRegression(solver="lbfgs",max_iter=300)
    log_model.fit(X_train, y_train)
    y_predict_log = log_model.predict(X_test)

    #build random forest model
    rf = RandomForestClassifier(max_features=42, n_estimators=100)
    rf.fit(X_train, y_train)
    y_predict_rf = rf.predict(X_test)

In [356]:
def eval_model(model, X_test, y_test,threshold):
    predicted_proba = model.predict_proba(X_test)
    preds = (predicted_proba [:,1] >= threshold).astype('int')
#     preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    rec = recall_score(y_test, preds)
    prec = precision_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    print('===ACCURACY===')
    print(acc)
    print('===RECALL===')
    print(rec)
    print('===PRECISION===')
    print(prec)
    print('===F1===')
    print(f1)

In [357]:
eval_model(rf,X_test, y_test,.6)

===ACCURACY===
0.8898698884758365
===RECALL===
0.03051643192488263
===PRECISION===
0.17567567567567569
===F1===
0.052


In [358]:
thresh_list= np.linspace(0,1,101)[::-1]
profit = []
for val in thresh_list:
    threshold = val
    predicted_proba = rf.predict_proba(X_test)
    preds = (predicted_proba [:,1] > threshold).astype('int')
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    invest = 100000
    yearly_return = .27
    hold_per=8
    profit.append(round((tp/(tn+fp+fn+tp))*(((1+yearly_return)**8)*invest)-((fp/(tn+fp+fn+tp))*invest),2))

In [359]:
profit

[0.0,
 0.0,
 0.0,
 157.24,
 134.0,
 134.0,
 134.0,
 134.0,
 110.77,
 110.77,
 110.77,
 110.77,
 110.77,
 64.3,
 41.07,
 41.07,
 41.07,
 41.07,
 41.07,
 -5.4,
 -5.4,
 -5.4,
 -28.64,
 -28.64,
 82.13,
 192.9,
 169.67,
 99.97,
 30.26,
 141.03,
 409.04,
 292.87,
 403.64,
 333.94,
 491.18,
 421.47,
 328.54,
 660.84,
 655.44,
 585.74,
 492.8,
 626.81,
 487.4,
 732.18,
 819.71,
 1041.25,
 1303.86,
 1682.63,
 1857.7,
 2009.54,
 1975.5,
 2150.58,
 2075.47,
 2338.08,
 2536.38,
 2391.58,
 2270.0,
 2736.32,
 2684.45,
 3261.53,
 3681.37,
 4322.76,
 4329.79,
 4342.22,
 4215.24,
 4583.22,
 4363.31,
 4911.75,
 4598.91,
 4850.71,
 5486.69,
 5604.49,
 5920.6,
 6236.7,
 6395.57,
 7031.55,
 6800.84,
 7163.41,
 7497.35,
 7552.47,
 7205.59,
 7085.64,
 7210.47,
 7819.45,
 7426.1,
 7627.66,
 7171.63,
 6937.15,
 6684.83,
 6352.0,
 5779.81,
 5511.28,
 4817.51,
 3607.19,
 2618.41,
 2865.92,
 2223.51,
 221.08,
 -943.29,
 -5826.76,
 -11178.69]

In [360]:
np.argsort(profit)

array([100,  99,  98,  23,  22,  21,  20,  19,   0,   2,   1,  28,  14,
        15,  16,  17,  18,  13,  24,  27,   9,   8,  12,  10,  11,   4,
         6,   7,   5,  29,   3,  26,  25,  97,  31,  36,  33,  32,  30,
        35,  42,  34,  40,  39,  41,  38,  37,  43,  44,  45,  46,  47,
        48,  50,  49,  52,  51,  96,  56,  53,  55,  54,  94,  58,  57,
        95,  59,  93,  60,  64,  61,  62,  63,  66,  65,  68,  92,  69,
        67,  70,  91,  71,  90,  72,  73,  89,  74,  88,  76,  87,  75,
        81,  77,  86,  80,  82,  84,  78,  79,  85,  83])

In [362]:
thresh_list[83]

0.17