In [None]:
import re
# Scratch check of the CamelCase splitter: each match is a capital letter plus
# the non-capitals that follow it, and the matches are joined with spaces.
' '.join(re.findall('[A-Z][^A-Z]*','WebCuration' ))


In [32]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 17})
import folium
import re
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor

def clean(file_path):
    '''
    Load the raw investments CSV and return a cleaned data frame.

    Parameters
        file_path: path of the CSV file (read with latin1 encoding)
    Returns
        a cleaned df in which
          * rows containing any missing value and rows with country_code 'CAN'
            are removed
          * founded_at / first_funding_at / last_funding_at are datetimes
            (unparseable values become NaT)
          * founded_year and funding_total_usd are int64
          * ' market ' and ' funding_total_usd ' are replaced by the
            whitespace-free 'market' and 'funding_total_usd' columns
          * permalink, region, founded_month, founded_quarter, country_code,
            homepage_url and name are dropped
    '''
    df = pd.read_csv(file_path,encoding='latin1')
    # Missing values are removed before any column is pruned, so a gap in a
    # column that is dropped later still removes the whole row.
    df.dropna(inplace=True)
    df.drop(['permalink','region','founded_month','founded_quarter'],axis=1,inplace=True)

    # Each date column is parsed from itself; previously last_funding_at was
    # built from first_funding_at, which discarded the real last-funding date.
    for date_col in ('founded_at', 'first_funding_at', 'last_funding_at'):
        df[date_col] = pd.to_datetime(df[date_col],errors='coerce')

    df['founded_year'] = df['founded_year'].astype('int64')
    df.drop(df[df['country_code']=='CAN'].index,inplace=True)

    # Amounts look like ' 1,750,000 ' and a lone '-' is used where no amount is
    # given: strip spaces and thousands separators, then turn '-' into '0'.
    funding = df[' funding_total_usd '].str.replace(' ','',regex=False)\
        .str.replace(',','',regex=False)\
        .str.replace('-','0',regex=False)
    df['funding_total_usd'] = funding.astype('int64')

    # All spaces are removed from the market label, inner ones included.
    df['market'] = df[' market '].str.replace(' ','',regex=False)

    df.drop([' market ',' funding_total_usd ','country_code','homepage_url','name'],
            axis=1,inplace=True)
    return df

def feature_engineer(df):
    '''
    Add engineered features to a cleaned data frame (modified in place).

    Parameters
        df: pandas data frame with 'market', 'founded_at' and
            'first_funding_at' columns
    Returns
        the same data frame with
          * 'time_to_funding': absolute number of days between founding and
            first funding
          * 'market' collapsed to its 20 most frequent values, everything else
            relabelled "Other"
          * rows containing any missing value removed
    '''
    funding_lag = df['first_funding_at'] - df['founded_at']
    df['time_to_funding'] = funding_lag.dt.days.abs()

    # value_counts() is ordered by descending frequency, so the first 20 index
    # entries are the 20 most common markets.
    top_markets = df['market'].value_counts().index[:20]
    is_rare = ~df['market'].isin(top_markets)
    df.loc[is_rare, 'market'] = 'Other'

    df.dropna(inplace=True)
    return df

def create_pie_charts(df,column,column_val,target):
    '''
    Draw and save a pie chart of the target distribution for one column value.

    Parameters
    df: Cleaned data frame
    column: column of data frame used to split data as string
    column_val: Value we are looking for in column as string
    target: The target values we are trying to predict
    Returns
    None; the figure is saved to ../images/<column_val>_pie.png
    Raises
    ValueError: if no row of df has df[column] == column_val
    '''
    # Split a CamelCase value such as 'WebCuration' into 'Web Curation' for the
    # title; fall back to the raw value when it contains no capitalised chunk.
    column_val_title = ' '.join(re.findall('[A-Z][^A-Z]*',column_val)) or column_val

    pie_df = df[df[column]==column_val][target].value_counts().rename_axis(target)\
        .reset_index(name='counts')
    if pie_df.empty:
        raise ValueError(f'no rows where {column} == {column_val!r}')

    # Share of each target value among the selected rows (sums to 1).
    pie_df['pct'] = pie_df['counts']/pie_df['counts'].sum()
    labels=pie_df[target]

    # Pull out the last (least frequent) slice. The list is sized to the data,
    # so the chart works for any number of target values, not only three.
    explode = [0]*len(pie_df)
    if len(explode) > 1:
        explode[-1] = .15

    fig, ax = plt.subplots(figsize=(14,7))
    ax.pie(pie_df['pct'], explode=explode, labels=labels, \
        autopct='%1.1f%%',shadow=True, startangle=50)
    ax.axis('equal')
    ax.set_title(f'{target.capitalize()} Of {column_val_title} Market')
    plt.savefig(f'../images/{column_val}_pie.png',dpi=500)


# Load and clean the raw CSV, then add the engineered features.
# NOTE(review): the path is relative to the working directory and points
# outside the project tree, so it only resolves on a matching machine layout.
# feature_engineer modifies and returns its argument, so intial_df and
# clean_feat_df refer to the same data frame.
intial_df =clean('../../../Downloads/investments_VC.csv')
clean_feat_df=feature_engineer(intial_df)

In [33]:
# Show which columns are available after cleaning and feature engineering.
clean_feat_df.columns

Index(['category_list', 'status', 'state_code', 'city', 'funding_rounds',
       'founded_at', 'founded_year', 'first_funding_at', 'last_funding_at',
       'seed', 'venture', 'equity_crowdfunding', 'undisclosed',
       'convertible_note', 'debt_financing', 'angel', 'grant',
       'private_equity', 'post_ipo_equity', 'post_ipo_debt',
       'secondary_market', 'product_crowdfunding', 'round_A', 'round_B',
       'round_C', 'round_D', 'round_E', 'round_F', 'round_G', 'round_H',
       'funding_total_usd', 'market', 'time_to_funding'],
      dtype='object')

In [34]:
# 0/1 indicator per funding-type column (every column from 'seed' through
# 'product_crowdfunding'): 1 when the amount is greater than zero.
funding_type_df = clean_feat_df.loc[:, 'seed':'product_crowdfunding'].gt(0).astype('int64')

In [36]:
# Encode the target as int64: 'acquired' -> 1, 'operating' and 'closed' -> 0.
# The dtype guard makes the cell safe to re-run: once the column is integer the
# string replacement would fail, so it is skipped.
if not pd.api.types.is_integer_dtype(clean_feat_df['status']):
    status_text = clean_feat_df['status']
    for status_label, status_code in (('operating', '0'), ('acquired', '1'), ('closed', '0')):
        status_text = status_text.str.replace(status_label, status_code, regex=False)
    clean_feat_df['status'] = status_text.astype('int64')

In [41]:
# Market labels ordered from least to most frequent; the dummy columns are
# re-ordered to follow that list.
col_list = clean_feat_df['market'].value_counts().sort_values().index.tolist()
market_dummies = pd.get_dummies(clean_feat_df['market']).reindex(columns=col_list)
# One indicator column per state code.
state_dummies = pd.get_dummies(clean_feat_df['state_code'])


# Features: the first 20 columns of market_dummies plus time_to_funding.
X =market_dummies.iloc[:,:20].join(clean_feat_df['time_to_funding']).values
y=clean_feat_df['status'].values

# Fixed seeds so a fresh Restart & Run All reproduces the same split,
# the same synthetic samples and therefore the same scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Oversample the minority class on the training part only; the test part keeps
# its original class balance.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)


model = LogisticRegression(solver="lbfgs")
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
y_true = y_test
print('Accuracy:', accuracy_score(y_true, y_predict))
print("Precision:", precision_score(y_test, y_predict))
print("Recall:", recall_score(y_test, y_predict))

Accuracy: 0.8629217791411042
Precision: 0.10909090909090909
Recall: 0.04419889502762431


In [42]:
# Features: first 20 market dummies, first 50 state dummies, time_to_funding
# and the 0/1 funding-type indicators.
X =market_dummies.iloc[:,:20].join(state_dummies.iloc[:,:50]).join(clean_feat_df['time_to_funding']).join(funding_type_df).values
y=clean_feat_df['status'].values

# Fixed seeds so the split, the synthetic samples and the forest are the same
# on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Oversample the minority class on the training part only.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

rf = RandomForestClassifier(oob_score=True,max_features='sqrt', n_estimators= 50, random_state=42)
rf.fit(X_train, y_train)
y_predict = rf.predict(X_test)

# The out-of-bag score is computed on the oversampled training data, so it is
# not directly comparable with the accuracy on the untouched test part.
print("Accuracy Score:", rf.score(X_test, y_test))
print("Out of Bag Score:", rf.oob_score_)
print("Precision:", precision_score(y_test, y_predict))
print("Recall:", recall_score(y_test, y_predict))


Accuracy Score: 0.85295245398773
Out of Bag Score: 0.9175511076287485
Precision: 0.165374677002584
Recall: 0.12598425196850394


In [None]:
# Grid search over forest size and per-split feature count.
# NOTE(review): X_train / y_train are whatever the most recently executed
# modelling cell left behind - confirm the intended feature set before running.
rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True, random_state=42)

# 'auto' is intentionally absent: for classifiers it meant the same as 'sqrt'
# (a duplicate grid point) and current scikit-learn releases reject it.
param_grid = {
    'n_estimators': [50,100,150,200,250, 700],
    'max_features': ['sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_)

In [None]:
# Features: the first 20 columns of market_dummies plus time_to_funding.
X =market_dummies.iloc[:,:20].join(clean_feat_df['time_to_funding']).values
y=clean_feat_df['status'].values

# Split first, then oversample the training part only. Resampling before the
# split would let synthetic points interpolated from test rows end up in the
# training set and inflate any score computed on X_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

# NOTE(review): a regressor is fitted on the 0/1 status target, so y_predict
# holds continuous scores rather than class labels - confirm this is intended.
regr = AdaBoostRegressor(n_estimators=100, random_state=42)
regr.fit(X_train,y_train)
y_predict = regr.predict(X_test)

In [None]:
# One indicator column per state code (same expression as in the earlier
# dummy-building cell; re-running it simply rebuilds the frame).
state_dummies = pd.get_dummies(clean_feat_df['state_code'])

In [None]:
# Inspect the first 50 state indicator columns (all rows are displayed).
state_dummies.iloc[:,:50]

In [None]:
# Consecutive integers 25..50 inclusive as a plain Python list.
x = list(range(25, 51))

In [None]:
# Display the list built in the previous cell.
x

In [None]:




# x = df['time_to_funding'].values
# min_max_scaler = preprocessing.MinMaxScaler()
# x_scaled = min_max_scaler.fit_transform(x)
# df = pd.DataFrame(x_scaled)

In [None]:
# Sum of the 0/1 labels, i.e. the number of positive (status == 1) samples in
# whatever y currently holds - the value depends on which cell last assigned y.
y.sum()

In [None]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot
from numpy import where

# summarize class distribution before resampling
counter = Counter(y)
print(counter)
# transform the dataset; the fixed seed makes the synthetic samples
# reproducible from run to run
oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(X, y)
# # summarize the new class distribution
# counter = Counter(y)
# print(counter)
# # scatter plot of

In [None]:
# Sum over every entry of the current feature matrix (quick sanity check).
X.sum()

In [None]:
# Number of positive labels in the current y; compare with the earlier
# y.sum() to see the effect of any resampling run in between.
y.sum()