<a href="https://colab.research.google.com/github/susanalima/CV-ONTOLOGY/blob/master/ECAC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [707]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools
import os
import csv

from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit, cross_validate
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import confusion_matrix, f1_score, make_scorer, balanced_accuracy_score, recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder


from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [708]:
#!unzip "/content/drive/My Drive/ECAC/data.zip" -d "/content/drive/My Drive/ECAC/data"

In [709]:
mainDir = "/content/drive/My Drive/ECAC/"
dataDir = mainDir + "data/publicData/"
predictionsDir = mainDir + "predictions/"

# directory for loan train data
# format: loan_id;account_id;date;amount;duration;payments;status
loanTrainDir = dataDir + "loan_train.csv"

# directory for loan test data
# format: loan_id;account_id;date;amount;duration;payments
loanTestDir = dataDir + "loan_test.csv"

# directory for account data
# format: "account_id";"district_id";"frequency";"date"
accountDir = dataDir + "account.csv"

# directory for district data
# format: code ;name ;region;no. of inhabitants;no. of municipalities with inhabitants < 499 ;no. of municipalities with inhabitants 500-1999;no. of municipalities with inhabitants 2000-9999 ;no. of municipalities with inhabitants >10000 ;no. of cities ;ratio of urban inhabitants ;average salary ;unemploymant rate '95 ;unemploymant rate '96 ;no. of enterpreneurs per 1000 inhabitants ;no. of commited crimes '95 ;no. of commited crimes '96 
districtDir = dataDir + "district.csv"

In [710]:
# read data from csv file in directory (dir), by default the delimiter is ';'
# returns the data
def read_from_file(dir, delimiter=';'):
  # Create a dataframe from csv
  df = pd.read_csv(dir, delimiter=delimiter)

  c = df.select_dtypes("object").columns.tolist()
  print(c)
  df= pd.get_dummies(df, columns=c)

  # User list comprehension to create a list of lists from Dataframe rows
  data = [list(row) for row in df.values]
  # Insert Column names as first list in list of lists
  #data.insert(0, df.columns.to_list())
  return data, df

# writes data to file in specified directory
def write_to_file(dir, content):
  with open(dir, "w") as f:
      writer = csv.writer(f)
      writer.writerows(content)

In [711]:
def list_int(lst):
  return [int(item) for item in lst]

def list_of_lists_int(lst):
  return [list_int(item) for item in lst]

# get list with sublist of elements of every list in the list of lists
# lst list of lists [[],[],[]]
# start: start position of sublist
# end: end position of sublist
def get_elems_list_of_lists(lst, start, end):
  return [item[start:end] for item in lst]

# create one single list from list of lists
def list_merge(a):
  return list(itertools.chain.from_iterable(a))

In [712]:
# splits data in two sets, train and test (suffles the data)
def split_data_rand(data, labels, size=0.2):
  X_train, X_test, y_train, y_test = train_test_split(data, labels, shuffle=True, test_size=size) 
  return X_train, X_test, y_train, y_test 

In [713]:
# runs default svm algorithm
def svm_run(xTrain, xTest, yTrain, weights="balanced", kernel="linear", average="macro"):

  # normalize features between 0 and 1
  xTrain = svm_normalize_features(xTrain)
  xTest = svm_normalize_features(xTest)

  svm_model = svm.SVC(kernel=kernel, class_weight=weights) 
  #Train the model using the training sets
  svm_model.fit(xTrain, yTrain)
  #Test the model using the testing sets
  yPred = svm_model.predict(xTest)
  return yPred


# runs default rf algorithm
def rf_run(xTrain, xTest, yTrain, weights="balanced", average="macro"):
  # Instantiate model with 50 decision trees
  rf_model = RandomForestClassifier(n_estimators = 50, class_weight=weights)
  #Train the model using the training sets
  rf_model.fit(xTrain, yTrain)
  #Test the model using the testing sets
  yPred = rf_model.predict(xTest)
  return yPred

# runs default mlp algorithm
def mlp_run(xTrain, xTest, yTrain, weights="balanced", average="macro"):
  mlp_model = MLPClassifier(
      solver="lbfgs", alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1
  )
  mlp_model.fit(xTrain, yTrain)
  yPred = mlp_model.predict(xTest)
  return yPred


In [714]:
# uses MinMaxScaler to normalize the features between 0 and 1 (required for svm)
def svm_normalize_features(features):
  #normalize between 0 and 1
  scaler = MinMaxScaler() 
  features = scaler.fit_transform(features)
  return features


In [715]:
# joins loans ids and predictions in list of lists
def add_id_to_predictions(ids, predictions):
  res = []
  for i in range(len(ids)):
    res.append([ids[i], predictions[i]])
  return res

# adds header to csv file as request for submission
def add_header_to_results(results):
  results.insert(0, ['Id','Predicted'])
  return results

In [716]:
# first implementation of cross validation
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html
# https://scikit-learn.org/stable/modules/cross_validation.html
def model_cross_validation(model, xTrain, yTest, cv, scoring):
  return cross_validate(model, xTrain, yTest, cv=cv, scoring=scoring, return_estimator=True)


# cross validation for svm
def svm_cross_validation(xTrain, yTrain, cv, scoring):
  xTrain = svm_normalize_features(xTrain)
  clf = svm.SVC(kernel='linear', class_weight="balanced")
  return model_cross_validation(clf, xTrain, yTrain, cv, scoring)


# cross validation for svm
def rf_cross_validation(xTrain, yTrain, cv, scoring):
  clf = RandomForestClassifier(n_estimators=50, class_weight="balanced")
  return model_cross_validation(clf, xTrain, yTrain, cv, scoring)

In [717]:
accountData, accountDataDF = read_from_file(accountDir)
districtData, districtDataDF = read_from_file(districtDir)

['frequency']
['name ', 'region', "unemploymant rate '95 ", "no. of commited crimes '95 "]


In [718]:
# read loan train data from corresponding directory
loanTrainData, loanTrainDataDF = read_from_file(loanTrainDir)

# read loan test data from # read loan train data from corresponding directory
loanTestData, loanTestDataDF = read_from_file(loanTestDir)
loanTestData = get_elems_list_of_lists(loanTestData, 0, 6)
#loanTestData = list_of_lists_int(loanTestData)

[]
[]


In [719]:
loanTrainDataDF = pd.merge(loanTrainDataDF, accountDataDF, on='account_id', how='inner')
loanTrainDataDF = loanTrainDataDF.drop(['status', 'loan_id', 'account_id'], axis=1)
#loanTrainDataDF = pd.merge(loanTrainDataDF, districtDataDF, left_on='district_id', right_on='code ', how='inner')
#loanTrainDataDF = loanTrainDataDF.drop(['code '], axis=1)

loanTestDataDF = pd.merge(loanTestDataDF, accountDataDF, on='account_id', how='inner')
loanTestDataDF = loanTestDataDF.drop(['status', 'loan_id', 'account_id'], axis=1)
#loanTestDataDF = pd.merge(loanTestDataDF, districtDataDF, left_on='district_id', right_on='code ', how='inner')
#loanTestDataDF = loanTestDataDF.drop(['code '], axis=1)

In [720]:
# loan train ids
loanTrainIds = get_elems_list_of_lists(loanTrainData, 0, 1)
loanTrainIds = list_merge(loanTrainIds)

# loan train "features"
loanTrainX = loanTrainDataDF.values

# loan train labels
loanTrainY = get_elems_list_of_lists(loanTrainData, 6, 7)
loanTrainY = list_merge(loanTrainY)

# loan test ids
loanTestIds = get_elems_list_of_lists(loanTestData, 0, 1)
loanTestIds = list_merge(loanTestIds)

# loan test "features"
loanTestX = loanTestDataDF.values

In [721]:
svmPredictions = svm_run(loanTrainX, loanTestX, loanTrainY)


rfPredictions = rf_run(loanTrainX, loanTestX, loanTrainY)
mlpPredictions = mlp_run(loanTrainX, loanTestX, loanTrainY)

svmResults = add_id_to_predictions(loanTestIds, svmPredictions)
svmResults = add_header_to_results(svmResults)

rfResults = add_id_to_predictions(loanTestIds, rfPredictions)
rfResults = add_header_to_results(rfResults)

mlpResults = add_id_to_predictions(loanTestIds, mlpPredictions)
mlpResults = add_header_to_results(mlpResults)


In [722]:
write_to_file(predictionsDir + "svm.csv", svmResults)
write_to_file(predictionsDir + "rf.csv", rfResults)
write_to_file(predictionsDir + "mlp.csv", mlpResults)

In [723]:
#cv = 10

nSplits=10  
cv = StratifiedShuffleSplit(n_splits=nSplits, test_size=0.2, random_state=0)

scoring = ['accuracy', 'balanced_accuracy']

svm_scores = svm_cross_validation(loanTrainX, loanTrainY, cv, scoring )
print("svm cross validation scores")
print(svm_scores['test_accuracy'])
print(svm_scores['test_balanced_accuracy'])

rf_scores = rf_cross_validation(loanTrainX, loanTrainY, cv, scoring )
print("\nrf cross validation scores")
print(rf_scores['test_accuracy'])
print(rf_scores['test_balanced_accuracy'])

svm cross validation scores
[0.51515152 0.57575758 0.63636364 0.59090909 0.62121212 0.59090909
 0.59090909 0.59090909 0.53030303 0.48484848]
[0.53216374 0.56725146 0.60233918 0.57602339 0.54678363 0.57602339
 0.57602339 0.43567251 0.44736842 0.51461988]

rf cross validation scores
[0.86363636 0.81818182 0.83333333 0.86363636 0.84848485 0.87878788
 0.84848485 0.86363636 0.87878788 0.86363636]
[0.54678363 0.47368421 0.48245614 0.5        0.5380117  0.60233918
 0.49122807 0.5        0.60233918 0.5       ]
