In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import math
import os

from sklearn import preprocessing

# Show up to 200 columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 200)

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used in this notebook — consider narrowing the filter.
warnings.filterwarnings('ignore')

# General statistics analysis functions


In [None]:
# Get Basic Column Statistics values - max, min, count, average, standard deviation, first, last

def getCount(df, col):
    """Return the number of non-missing values in column ``col`` of ``df``."""
    column = df[col]
    return column.count()
def getMin(df, col):
    """Return the smallest value found in column ``col`` of ``df``."""
    column = df[col]
    return column.min()
def getMax(df, col):
    """Return the largest value found in column ``col`` of ``df``."""
    column = df[col]
    return column.max()
def getAvg(df, col):
    """Return the arithmetic mean of column ``col`` of ``df``."""
    column = df[col]
    return column.mean()
def getFirst(df):
    """Return the entry at position 0 of ``df`` (its first row)."""
    first_entry = df.iloc[0]
    return first_entry
def getLast(df):
    """Return the entry at the last position of ``df`` (its last row)."""
    last_entry = df.iloc[-1]
    return last_entry
def getStd(df, col):
    """Return the standard deviation of column ``col`` of ``df``.

    Uses the pandas default for ``Series.std``.
    """
    column = df[col]
    return column.std()

# Helper Functions

In [5]:
# Converts a dataset column to the given numeric dtype in place (unparseable and missing values become 0)
def data_trans(dataset, attribute, type):
    """Cast column ``attribute`` of ``dataset`` to dtype ``type``, in place.

    Values that cannot be parsed as numbers, as well as missing values, are
    replaced by 0 before the cast. Nothing is returned; ``dataset`` itself is
    modified.
    """
    # ``type`` shadows the builtin, but it is part of the public signature.
    numeric_values = pd.to_numeric(dataset[attribute], errors='coerce')
    zero_filled = numeric_values.fillna(0)
    dataset[attribute] = zero_filled.astype(type)

# Plot-related Functions


In [4]:
def scatterplot(dataset, hue="status"):
    """Draw a seaborn pair plot of ``dataset`` and show it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to plot; passed straight to ``sns.pairplot``.
    hue : str or None, default "status"
        Name of the column used to colour the points. The default keeps the
        previous hard-coded behaviour; pass another name (for example
        ``"status_loan"`` for the merged dataset built in this notebook) or
        ``None`` for frames that have no ``status`` column.
    """
    sns.set_theme(style="ticks")
    sns.pairplot(dataset, hue=hue)
    plt.show()
   
def heatmap(dataset):
    """Print and plot the absolute correlation matrix of ``dataset``.

    Only numeric and boolean columns take part in the correlation; other
    columns (strings, dates, ...) are left out.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data whose pairwise column correlations are shown.
    """
    # pandas >= 2.0 raises on non-numeric columns instead of silently
    # skipping them, so restrict the frame before calling corr().
    numeric_data = dataset.select_dtypes(include=["number", "bool"])

    # Absolute values: the heatmap shows correlation strength, not direction.
    dataset_corr = numeric_data.corr().abs()
    print(dataset_corr)

    sns.heatmap(dataset_corr, cmap='RdYlGn_r', linewidths=0.5, annot=True)
    plt.yticks(rotation=0)
    plt.xticks(rotation=90)
    plt.show()
    
def boxplot(datasetCols, datasetList):
    """Draw a horizontal, notched box plot with one box per data series.

    Parameters
    ----------
    datasetCols : sequence of str
        Tick labels, one per box, in the same order as ``datasetList``.
    datasetList : sequence of array-like
        Data for each box; passed to ``Axes.boxplot``.
    """
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111)

    # patch_artist=True makes the boxes fillable patches; notch and vert are
    # passed as real booleans (horizontal boxes).
    bp = ax.boxplot(datasetList, patch_artist=True,
                    notch=True, vert=False)

    colors = ['#0000FF', '#00FF00',
              '#FFFF00', '#FF00FF']

    # Cycle through the palette so that every box is coloured, even when
    # there are more boxes than palette entries.
    for index, patch in enumerate(bp['boxes']):
        patch.set_facecolor(colors[index % len(colors)])

    # Whiskers: colour, width and dotted style.
    for whisker in bp['whiskers']:
        whisker.set(color='#8B008B',
                    linewidth=1.5,
                    linestyle=":")

    # Caps: colour and width.
    for cap in bp['caps']:
        cap.set(color='#8B008B',
                linewidth=2)

    # Medians: colour and width.
    for median in bp['medians']:
        median.set(color='red',
                   linewidth=3)

    # Fliers (outlier markers): shape, colour and transparency.
    for flier in bp['fliers']:
        flier.set(marker='D',
                  color='#e7298a',
                  alpha=0.5)

    # The plot is horizontal, so the per-box labels go on the y-axis.
    ax.set_yticklabels(datasetCols)

    plt.title("Box Plot")

    # Keep ticks only on the bottom and left axes.
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()

    plt.show()

def barplotcount(dataset, column_name):
    """Show a seaborn count plot of ``column_name`` from ``dataset``."""
    sns.countplot(data=dataset, x=column_name)
    plt.show()


# Handling Missing Values

In [3]:
def nulls(dataset, jcolumns):
    """Drop rows with missing values in ``jcolumns`` and zero-fill the rest.

    Rows that have a missing value (``None`` or ``NaN``) in any of the listed
    columns are removed; every missing value left in the other columns is
    then replaced by 0. ``dataset`` is modified in place and also returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to clean (mutated).
    jcolumns : iterable of str
        Columns in which a missing value causes the whole row to be dropped.
        Names that are not columns of ``dataset`` are reported with a printed
        error message and otherwise ignored.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object.
    """
    present = []
    for col in jcolumns:
        if col in dataset.columns:
            present.append(col)
        else:
            print('Error: column ' + col + ' does not exist in dataset')

    # dropna treats both None and NaN as missing and works for any index,
    # unlike a positional loop that drops by label while iterating.
    if present:
        dataset.dropna(subset=present, inplace=True)

    # fillna returns a new frame unless inplace=True is given.
    dataset.fillna(0, inplace=True)
    return dataset

# Feature Selection

# Load Datasets


In [2]:
    # Folder holding the competition CSV files; the single place to change
    # when the data lives somewhere else.
    DATA_DIR = "C:/Users/ASUS/Desktop/FEUP/4ano/1Semestre/AC/Projeto-AC/ficheiros_competicao_dev"

    account = pd.read_csv(f"{DATA_DIR}/account.csv")
    cards = pd.read_csv(f"{DATA_DIR}/card_dev.csv")
    clients = pd.read_csv(f"{DATA_DIR}/client.csv")
    disps = pd.read_csv(f"{DATA_DIR}/disp.csv")
    districts = pd.read_csv(f"{DATA_DIR}/district.csv")
    loans = pd.read_csv(f"{DATA_DIR}/loan_dev.csv")
    trans = pd.read_csv(f"{DATA_DIR}/trans_dev.csv")

    # Start from the accounts table; rename its columns so they do not clash
    # with same-named columns of the tables merged in below.
    dataset = account.rename({'frequency' : 'frequency_account', 'date' : 'date_of_creation'}, axis=1)

    # Loans.
    # NOTE(review): merges without `on=` join on every column name the two
    # frames share — confirm that is only the intended key here.
    dataset = dataset.merge(loans, how='outer')
    dataset = dataset.drop('loan_id', axis=1)
    dataset = dataset.rename({'date' : 'date_of_loan', 'duration' : 'duration_loan', 'payments' : 'payments_loan', 'status' : 'status_loan'}, axis=1)

    # Transactions: total number of transactions per account ...
    dataset = dataset.merge(pd.DataFrame(trans.groupby('account_id').size(), columns=['n.of trans']), left_on='account_id', right_index=True, how="outer")

    # ... and number of transactions per account for each operation kind.
    # Each pair is (value of trans['operation'], name of the count column).
    OPERATION_COUNT_COLUMNS = [
        ('credit in cash', 'credit_cash'),
        ('credit card withdrawal', 'cc_wdw'),
        ('withdrawal in cash', 'wdw_cash'),
        ('collection from another bank', 'coll_bank'),
        ('remittance to another bank', 'rem_bank'),
    ]
    for operation, count_column in OPERATION_COUNT_COLUMNS:
        operation_counts = trans[trans['operation'] == operation].groupby('account_id').size()
        dataset = dataset.merge(pd.DataFrame(operation_counts, columns=[count_column]), right_index=True, left_on='account_id', how="outer")

    # Dispositions.
    dataset = dataset.merge(disps,  how='outer')
    # NOTE(review): this rename only has an effect if the merge above produced
    # type_x / type_y columns; names that are absent are silently ignored.
    dataset = dataset.rename({'type_x' : 'trans_type', 'type_y' : 'disp_type'}, axis=1)

    # Cards.
    # NOTE(review): no `on=` here either; if `cards` shares more than the
    # disposition key with `dataset` (e.g. a `type` column), those columns are
    # used as join keys as well — confirm.
    dataset = dataset.merge(cards, how='outer')
    dataset = dataset.rename({'type' : 'card_type'}, axis=1)
    dataset = dataset.drop('disp_id', axis=1)

    # Clients, then the district of each client.
    dataset = dataset.merge(clients, left_on="client_id", right_on="client_id", how='outer')
    dataset = dataset.rename({'district_id_x' : 'district_id_account', 'district_id_y' : 'district_id_client'}, axis=1)
    dataset = dataset.merge(districts, left_on = "district_id_client", right_on="district_id", how='outer' )

# Processing Each Individual Table

In [None]:
def loans_handler(loans_df):
    """Process the loans table (not implemented yet).

    TODO: add the loan-specific preprocessing steps.

    Parameters
    ----------
    loans_df : pandas.DataFrame
        The loans table.

    Raises
    ------
    NotImplementedError
        Always, until the preprocessing is written.
    """
    raise NotImplementedError("loans_handler is not implemented yet")

In [None]:
def district_handler(district_df):
    data_trans(district_df, 'no. of commited crimes \'95', 'int64')