In [1]:
import numpy as np # linear algebra
import pandas as pd
from sklearn.model_selection import train_test_split
import math
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt  # Matlab-style plotting
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
sns.set(style='white', context='notebook', palette='deep')
import warnings
def ignore_warn(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing.

    Serves as a drop-in replacement for ``warnings.warn`` (see the assignment
    right below).
    """
    return None


# Swap warnings.warn for the no-op above so that calls to it emit nothing
# (the original note mentions sklearn and seaborn as the noisy sources).
warnings.warn = ignore_warn

# Constant kept separately from the NumPy global seed set on the next line.
Random_state = 42
np.random.seed(0)

In [9]:
#Models import
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
#import imputer:
from sklearn.impute import KNNImputer
#score
from sklearn.metrics import f1_score
from sklearn.ensemble import StackingClassifier

In [79]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import seaborn as sns
import matplotlib.pyplot as plt

def getOutliersMatrix(numerical_df, threshold=1.5):
    """Flag inter-quartile-range (IQR) outliers in every column of a frame.

    A value is flagged when it lies more than ``threshold`` IQRs below the
    first quartile or above the third quartile of its own column.

    Parameters
    ----------
    numerical_df : pandas.DataFrame
        Frame whose columns support ``quantile`` and ordering comparisons.
    threshold : float, default 1.5
        Multiplier applied to the IQR when building the lower/upper fences.

    Returns
    -------
    pandas.DataFrame
        Integer frame holding 1 where the value is an outlier and 0
        elsewhere. Missing values compare as False and are reported as 0.
    """
    q1 = numerical_df.quantile(0.25)
    q3 = numerical_df.quantile(0.75)
    iqr = q3 - q1

    # Honour the caller-supplied multiplier instead of a hard-coded 1.5.
    lower_fence = q1 - threshold * iqr
    upper_fence = q3 + threshold * iqr
    is_outlier = (numerical_df < lower_fence) | (numerical_df > upper_fence)

    # Convert the boolean mask in one step; writing ints into boolean columns
    # cell by cell relies on implicit dtype upcasting.
    return is_outlier.astype(int)

def imputing_numeric_missing_values(dataset, n_neighbors=10):
    """Fill missing values of the numeric columns with a KNN imputer.

    The frame is modified in place and also returned, as before.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to impute. Only columns selected by
        ``select_dtypes([np.number])`` are touched.
    n_neighbors : int, default 10
        Number of neighbours handed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object with its numeric gaps filled. Numeric
        columns without a single observed value are left untouched, and a
        frame with nothing to impute is returned unchanged.
    """
    numerical_column_names = dataset.select_dtypes([np.number]).columns

    # KNNImputer drops features that contain no observed value by default,
    # which would make its output narrower than the selected columns; keep
    # only columns that have something to learn from.
    has_observed_values = dataset[numerical_column_names].notna().any(axis=0)
    imputable_columns = numerical_column_names[has_observed_values.to_numpy()]
    if len(imputable_columns) == 0:
        # Nothing to impute; the imputer would reject an empty feature matrix.
        return dataset

    knn = KNNImputer(n_neighbors=n_neighbors)
    knn_dataset = knn.fit_transform(dataset[imputable_columns])

    # Reuse the caller's index: column assignment aligns on index labels, so a
    # default RangeIndex would misplace rows of a filtered/re-indexed frame.
    dataset[imputable_columns] = pd.DataFrame(
        knn_dataset, columns=imputable_columns, index=dataset.index
    )
    return dataset

def draw_heatmap(df):
    """Show an annotated heatmap of the pairwise correlations in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. Only numeric and boolean columns take part in the
        correlation; other columns (strings, dates, ...) are ignored.
    """
    # Select the numeric/bool columns explicitly: recent pandas versions raise
    # when corr() meets a non-numeric column instead of silently skipping it.
    numeric_df = df.select_dtypes(include=["number", "bool"])

    plt.figure(figsize=(12, 10))
    sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
    plt.show()

def MaximizeSuccess(toMaximize: str, csv_path: str = "./startup data.csv", top_n: int = 3):
    """Return the numeric features most correlated with a target column.

    The CSV is loaded with ``status`` encoded as 1 for ``'acquired'`` and 0
    otherwise (then renamed ``is_acquired``), helper/duplicate columns are
    dropped, numeric gaps are filled by ``imputing_numeric_missing_values``
    and ``closed_at`` becomes a flag (1 when the value is missing, 0
    otherwise). The remaining numeric columns are ranked by the absolute
    value of their correlation with ``toMaximize``.

    Parameters
    ----------
    toMaximize : str
        Name of a numeric column of the prepared dataset.
    csv_path : str, default "./startup data.csv"
        Location of the CSV file to load.
    top_n : int, default 3
        Number of (feature, correlation) pairs to return.

    Returns
    -------
    list of (str, float)
        Up to ``top_n`` pairs sorted by decreasing absolute correlation.
        Undefined correlations (NaN) are treated as 0.

    Raises
    ------
    KeyError
        If ``toMaximize`` is not a numeric column of the prepared dataset.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    dataset = pd.read_csv(
        csv_path,
        converters={'status': lambda x: int(x == 'acquired')},
        parse_dates=['founded_at', 'first_funding_at', 'last_funding_at'],
    )
    dataset = dataset.rename(columns={'status': 'is_acquired'})

    # Columns that are dropped before the analysis; missing ones are skipped
    # so a CSV that lacks one of them does not abort the whole run.
    dataset = dataset.drop(
        columns=["Unnamed: 6", "Unnamed: 0", "labels", "state_code.1"],
        errors="ignore",
    )

    dataset = imputing_numeric_missing_values(dataset)

    # 1 when no closing date is recorded, 0 when one is present.
    dataset['closed_at'] = dataset['closed_at'].isna().astype(int)

    numeric_df = dataset.select_dtypes(include=[np.number])
    if toMaximize not in numeric_df.columns:
        raise KeyError(
            f"{toMaximize!r} is not a numeric column of the dataset; "
            f"available columns: {list(numeric_df.columns)}"
        )

    correlations = numeric_df.corr()[toMaximize].abs().fillna(0).to_dict()

    # sorted() is stable, so ties keep the column order of the dataset.
    ranked = sorted(
        ((name, value) for name, value in correlations.items() if name != toMaximize),
        key=lambda item: item[1],
        reverse=True,
    )
    return ranked[:top_n]

# Example usage: rank the variables most correlated with `is_top500`.
toMaximize = 'is_top500'
most_correlated_variables = MaximizeSuccess(toMaximize)
# The printed messages are in French ("The 3 variables most correlated with ... are:").
print(f'Les 3 variables les plus corrélées avec {toMaximize} sont :')
for var, corr in most_correlated_variables:
    # One "<variable>: <absolute correlation>" line per returned pair.
    print(f'{var}: {corr}')

Les 3 variables les plus corrélées avec is_top500 sont :
avg_participants: 0.33158103452730747
is_acquired: 0.3106518012598112
has_roundB: 0.3051833887628251


In [74]:
# NOTE(review): in the visible cells `dataset` only exists as a local variable
# inside MaximizeSuccess, and this cell's execution count (74) precedes that
# cell's (79) — presumably it relies on leftover kernel state; confirm and
# define `dataset` explicitly so the notebook survives Restart & Run All.
dataset

Unnamed: 0,state_code,latitude,longitude,zip_code,id,city,name,labels,founded_at,closed_at,...,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,is_acquired,months_between_first_and_last_funding,months_between_foundation_and_first_funding
0,CA,42.358880,-71.056820,92101,6669,San Diego,Bandsintown,1,2007-01-01,1,...,1,0,0,0,0,1.0000,0,1,9.17,27.37
1,CA,37.238916,-121.973718,95032,16283,Los Gatos,TriCipher,1,2000-01-01,1,...,0,0,1,1,1,4.7500,1,1,59.27,62.37
2,CA,32.901049,-117.192656,92121,65620,San Diego,Plixi,1,2009-03-18,1,...,0,1,0,0,0,4.0000,1,1,0.00,12.57
3,CA,37.320309,-122.050040,95014,42668,Cupertino,Solidcore Systems,1,2002-01-01,1,...,0,0,1,1,1,3.3333,1,1,26.57,38.10
4,CA,37.779281,-122.419236,94105,65806,San Francisco,Inhale Digital,0,2010-08-01,0,...,1,0,0,0,0,1.0000,1,0,20.30,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
918,CA,37.740594,-122.376471,94107,21343,San Francisco,CoTweet,1,2009-01-01,1,...,0,1,0,0,0,6.0000,1,1,0.00,6.30
919,MA,42.504817,-71.195611,1803,41747,Burlington,Reef Point Systems,0,1998-01-01,0,...,0,0,1,0,0,2.6667,1,0,24.03,88.23
920,CA,37.408261,-122.015920,94089,31549,Sunnyvale,Paracor Medical,0,1999-01-01,0,...,0,0,0,0,1,8.0000,1,0,0.00,103.37
921,CA,37.556732,-122.288378,94404,33198,San Francisco,Causata,1,2009-01-01,1,...,0,1,1,0,0,1.0000,1,1,25.23,9.23
