In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import folium
import datetime, time
import sklearn
import networkx as nx
from community import community_louvain
from networkx.algorithms.community.centrality import girvan_newman
from scipy import stats
from scipy.stats import ttest_ind
import statsmodels.stats.api as sms
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, cross_val_score, GridSearchCV
from sklearn import datasets, linear_model
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn import svm
plt.rcParams['figure.figsize'] = [15, 8]

  from numpy.core.umath_tests import inner1d


# Statistics

In [2]:
#Statistics

def descr_stats(df):
    '''
    Compute descriptive statistics for a pandas object.
     :input: pandas dataframe (or any object exposing a describe() method)
     :output: -
     :return: the result of df.describe()
     '''
    # The summary has to be returned; calling describe() without returning
    # it throws the result away and leaves the function returning None.
    return df.describe()
    
def correlation(x, y, corr = 'spearman'):
    '''
    Compute the correlation between two samples.
     :input: x, y - array-likes of equal length;
             corr - 'spearman' (default) or 'pearson'
     :output: -
     :return: result of stats.spearmanr(x, y) or stats.pearsonr(x, y)
     :raises: ValueError if corr is neither 'spearman' nor 'pearson'
     '''
    if corr == 'spearman':
        return stats.spearmanr(x,y)
    if corr == 'pearson':
        return stats.pearsonr(x,y)

    # Fail loudly instead of falling through and returning None, which
    # would only surface later as a confusing unpacking/attribute error.
    raise ValueError("corr must be 'spearman' or 'pearson', got %r" % (corr,))
    
def conf_interval_95(data):
    '''
    Normal-approximation 95% confidence interval for the mean of data.
     :input: object exposing mean() and std() and supporting len()
     :output: -
     :return: numpy array [lower bound, upper bound]
     '''
    center = data.mean()
    # Uses whatever data.std() returns (NOTE: pandas and NumPy have
    # different default ddof, so the result depends on the input type).
    spread = data.std()
    size = len(data)

    # half width of the interval: 1.96 * std / sqrt(sample_size)
    half_width = 1.96 * spread/np.sqrt(size)
    return np.array([center - half_width, center + half_width])

def LMplot (set, xaxis, yaxis, degree=None):
    '''
    Function fits a regression model to two columns of a dataframe and plots
    the data together with the fitted curve.

    With degree=None (the default) an ordinary linear regression is fitted
    with stats.linregress and its statistics are printed. With an integer
    degree a polynomial of that degree is fitted with np.polyfit and only
    the plot is produced.

    Both variants used to be separate functions with the same name, so the
    later definition shadowed the earlier one; they are merged here.

     :input: set - pandas dataframe (name kept for keyword-argument
                   compatibility even though it shadows the builtin);
             xaxis, yaxis - column names used as x and y;
             degree - None for linear regression with statistics, or the
                      polynomial degree passed to np.polyfit
     :output: plot of data and fitted line; for degree=None also the
              printed regression statistics
     :return: -
     '''
    x= set[xaxis]
    y= set[yaxis]

    #plot existing data
    plt.figure(figsize=(18, 10))
    plt.plot(x, y, 'o',ms=2, label='original data')

    if degree is None:
        #calculate linear regression
        slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
        plt.plot(x, intercept + slope*x, 'r', label='fitted line')
    else:
        #calculate regression with polynomials of specified degree
        fit = np.polyfit(x, y, degree)
        # x must be sorted, otherwise the curve is drawn zig-zagging
        # between points in row order
        x_sorted = np.sort(x)
        plt.plot(x_sorted,np.polyval(fit,x_sorted),'-r', label='fitted line')

    plt.legend()
    plt.xlabel(xaxis)
    plt.ylabel(yaxis)
    plt.show()

    if degree is None:
        #print statistics
        print('''The formula for the fitted linear Model is: y = %f + %f * x \n \n r-value = %f
 p-value = %f \n Standard_error: %f''' % (intercept, slope,r_value, p_value, std_err)) 

# Machine Learning

In [3]:
#Machine Learning

def make_binary(df, column, key):
    '''
    Replace a column by a 0/1 indicator of equality with key.
     :input: df - pandas dataframe; column - name of the column to encode;
             key - value that is mapped to 1 (everything else becomes 0)
     :output: df[column] is overwritten in place
     :return: the same dataframe object
     '''
    def _flag(value):
        if value == key:
            return 1
        return 0

    df[column] = df[column].map(_flag)
    return df

def convert_categorical(df):
    '''
    Convert categorical columns to indicator columns.
     :input: pandas dataframe
     :output: -
     :return: result of pd.get_dummies(df)
     '''
    return pd.get_dummies(df)

def array_lambda(prediction, threshold=0.5):
    '''
    Turn per-class probabilities into hard 0/1 predictions for class 1.

    The previous version took no arguments and read the local name
    `prediction` before assigning it, so every call raised
    UnboundLocalError and nothing was ever returned.

     :input: prediction - 2-D array-like whose column 1 holds the
                          probability of class 1;
             threshold - probabilities >= threshold are mapped to 1
     :output: -
     :return: 1-D numpy integer array of 0/1 predictions
     '''
    probabilities = np.asarray(prediction)
    #prediction for class = 1
    return (probabilities[:, 1] >= threshold).astype(int)

def standardize(x):
    '''
    Standardize data by subtracting the mean and dividing by the standard
    deviation.
     :input: x - numeric array-like accepted by np.mean / np.std
     :output: -
     :return: (standardized data, mean of x, standard deviation of the
              centered data)
     '''
    mean_x = np.mean(x)
    centered = x - mean_x
    # the spread is measured on the centered values
    std_x = np.std(centered)
    return centered / std_x, mean_x, std_x

def split_train_data(data, labels, fraction, random_state=100):
    '''
    Randomly split a dataframe and its labels into a training and a test set.

    Labels are matched to rows by POSITION: row i of labels belongs to row i
    of data. The previous version passed the index labels of the sampled
    rows straight to labels.iloc, which is only correct when data carries
    the default 0..n-1 index; for any other index it picked wrong labels
    or raised IndexError.

     :input: data - pandas dataframe with a unique index;
             labels - pandas series/dataframe with one row per row of data;
             fraction - share of rows put into the training set;
             random_state - seed forwarded to DataFrame.sample (default 100,
                            the value that used to be hard-coded)
     :output: -
     :return: X_train, X_test, y_train, y_test as numpy arrays
     '''
    #Split data into training and test set
    X_train_df = data.sample(frac=fraction,random_state=random_state)
    X_test_df = data.drop(X_train_df.index)

    #Translate the index labels of both parts back into row positions
    train_pos = data.index.get_indexer(X_train_df.index)
    test_pos = data.index.get_indexer(X_test_df.index)

    #Retrieve the labels based on the positions of the X_train and X_test rows
    y_train = labels.iloc[train_pos].values
    y_test = labels.iloc[test_pos].values

    #Get only values of the data set
    X_train = X_train_df.values
    X_test = X_test_df.values
    return X_train, X_test, y_train, y_test

def confusion_mtx(y_test, y_pred, class_):
    '''
    Print the confusion matrix and the derived performance figures for a
    binary classification with labels 0 (negative) and 1 (positive).

    Fixes over the previous version: the matrix was stored as conf_mtx but
    read back as conf_matrix (NameError), and the chained [a][b] lookups
    select column first, which swapped false positives and false negatives.

     :input: y_test - true labels; y_pred - predicted labels (array-likes
                      accepted by pd.crosstab);
             class_ - only used as a caption in the printed report
     :output: prints the confusion matrix and accuracy, precision, recall
              and F1-score
     :return: dict with keys 'accuracy', 'precision', 'recall', 'fscore'
     '''
    #Get confusion matrix (rows: true labels, columns: predicted labels)
    conf_mtx = pd.crosstab(y_test, y_pred)
    print(conf_mtx)

    #Make sure both classes exist on both axes, even if one of them never
    #occurs in y_test or y_pred
    counts = conf_mtx.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

    #Extract the values from the confusion matrix as (true, predicted)
    true_pos = int(counts.loc[1, 1])
    true_neg = int(counts.loc[0, 0])
    false_pos = int(counts.loc[0, 1])
    false_neg = int(counts.loc[1, 0])

    #Manually compute the different performance parameters; a zero
    #denominator yields 0.0 instead of a division error
    total = conf_mtx.values.sum()
    accuracy = (true_pos + true_neg) / total if total else 0.0
    precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
    recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
    fscore = 2*precision*recall / (precision + recall) if (precision + recall) else 0.0

    print('For Class {}:'.format(class_),'\n','Accuracy is {}'.format(accuracy),'\n','Precision is {}'.format(precision),'\n',
             'Recall is {}'.format(recall),'\n','F1-score is {}'.format(fscore),'\n','\n')

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'fscore': fscore}

def Logistic_reg(X_train, y_train, X_test):
    '''
    Fit a logistic regression and predict hard labels for the test data.
     :input: X_train, y_train - training features and labels;
             X_test - features to predict
     :output: -
     :return: numpy array with 1 where the probability of class 1 is at
              least 0.5 and 0 otherwise
     '''
    model = linear_model.LogisticRegression()
    model.fit(X_train, y_train)

    # column 1 of predict_proba is read as the probability of class = 1
    class_probabilities = model.predict_proba(X_test)
    #prediction for class = 1 with threshold 0.5
    return np.where(class_probabilities[:, 1] >= 0.5, 1, 0)

def mask_array_xtest(X_test, selector):
    '''
    Keep only the selected features in every row of X_test.
     :input: X_test - iterable of numpy rows;
             selector - object whose get_support() returns the mask/index
                        of the features to keep
     :output: -
     :return: numpy array built from the reduced rows
     '''
    support = selector.get_support()
    reduced_rows = []
    for row in X_test:
        reduced_rows.append(row[support])
    return np.asarray(reduced_rows)

def get_features(df, selector):
    '''
    Look up the names of the features kept by a selector.
     :input: df - pandas dataframe whose columns are the candidate features;
             selector - object whose get_support() returns the mask/index
                        of the features to keep
     :output: -
     :return: the entries of df.columns picked by selector.get_support()
     '''
    return df.columns[selector.get_support()]

def RandomForest_GridSearch(X_train, y_train, cv=5, random_state=None):
    '''
    Grid search over n_estimators and max_depth of a random forest.

    The best parameters are now returned as well as printed, so they can be
    passed on in code instead of being copied by hand from the output.

     :input: X_train, y_train - training features and labels;
             cv - cross-validation setting forwarded to GridSearchCV;
             random_state - optional seed for the random forest; None keeps
                            the previous unseeded behaviour
     :output: prints the best parameter combination
     :return: dict of the best parameters (GridSearchCV.best_params_)
     '''
    rfc = RandomForestClassifier(random_state=random_state)

    # 20 values for the number of trees (1..100), 10 values for the depth (1..20)
    param_grid = { 
        'n_estimators': np.linspace(1, 100, 20, endpoint=True, dtype=int),
        'max_depth': np.linspace(1, 20, 10, endpoint=True, dtype=int)
        }

    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= cv)
    CV_rfc.fit(X_train, y_train)
    print(CV_rfc.best_params_)
    return CV_rfc.best_params_

def RandomForest(X_train, y_train, X_test, estimators, depth, random_state=None):
    '''
    Fit a random forest classifier and predict labels for the test data.
     :input: X_train, y_train - training features and labels;
             X_test - features to predict;
             estimators - number of trees (n_estimators);
             depth - maximum tree depth (max_depth);
             random_state - optional seed for the forest; None keeps the
                            previous unseeded behaviour
     :output: -
     :return: predicted labels for X_test
     '''
    rfc = RandomForestClassifier(
            n_estimators= estimators, 
            max_depth=depth,
            random_state=random_state
            )
    rfc.fit(X_train, y_train)
    # The feature importances used to be read into a local that was never
    # used or returned; that dead assignment has been dropped.
    #prediction
    return rfc.predict(X_test)