In [7]:
import pandas as pd
from pandas.api import types
from six import string_types
import joblib

In [8]:
def read_data(path, save_file=True, return_file=True, save_path="data.pkl"):
    """
    Read a .csv file into a pandas dataframe and optionally dump it to disk.

    Parameters
    -----------
    path        : str   - Dataset path (passed straight to ``pd.read_csv``)
    save_file   : bool  - If true, dump the dataframe to ``save_path`` with joblib
    return_file : bool  - If true, return the dataframe
    save_path   : str   - Destination of the joblib dump. Defaults to "data.pkl",
                          the name that used to be hard-coded. Give each dataset
                          its own name, otherwise successive calls overwrite
                          the same file.

    Return
    -------
    data    : pandas dataframe  - the loaded dataframe, or None when
                                  ``return_file`` is false
    """
    data = pd.read_csv(path)

    # The dump is optional and can be skipped with save_file=False.
    if save_file:
        joblib.dump(data, save_path)

    if return_file:
        return data
    return None



In [9]:
def split_input_output(dataset, target_column, save_file=True, return_file=True):
    """
    Separate a dataset into its input columns and its output (target) column.

    Parameters
    -----------
    dataset         : pandas dataframe  - Dataset
    target_column   : str               - name of the column used as output
    save_file       : bool              - If true, dump both results to pickle files with joblib
    return_file     : bool              - If true, return both results

    Return
    -------
    input_df        : pandas dataframe  - every column except ``target_column``
    output_df       : pandas object     - ``dataset[target_column]``
    (None is returned when ``return_file`` is false)
    """
    # Take the target first, then keep everything else as the input.
    target = dataset[target_column]
    features = dataset.drop(columns=[target_column])

    # The dump is optional and can be skipped with save_file=False.
    if save_file:
        joblib.dump(features, "input_df.pickle")
        joblib.dump(target, "output_df.pickle")

    if not return_file:
        return None
    return features, target

In [10]:
def split_train_validation(input_df, output_df, save_file=True, return_file=True,
                           test_size=0.2, random_state=123):
    """
    Split the training dataset into a training part and a validation part.

    Parameters
    -----------
    input_df     : pandas dataframe  - input dataframe
    output_df    : pandas dataframe  - output dataframe
    save_file    : bool              - If true, dump the four splits to pickle files with joblib
    return_file  : bool              - If true, return the four splits
    test_size    : float             - forwarded to ``train_test_split``; intended as
                                       validation_dataset / total_dataset
    random_state : int               - seed forwarded to ``train_test_split``. The default
                                       (123) is the value that used to be hard-coded, so
                                       existing results stay reproducible.

    Return
    -------
    X_train           : pandas dataframe  - training input
    X_validation      : pandas dataframe  - validation input
    y_train           : pandas dataframe  - training output
    y_validation      : pandas dataframe  - validation output
    (None is returned when ``return_file`` is false)

    NOTE(review): ``train_test_split`` is not imported in the notebook's visible
    import cell. Presumably it is ``sklearn.model_selection.train_test_split`` -
    confirm it is imported before this function is called.
    """
    # Work on copies so the caller's objects are never aliased.
    X = input_df.copy()
    y = output_df.copy()

    # A fixed seed makes the split repeatable between runs.
    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
    )

    # The dump is optional and can be skipped with save_file=False.
    if save_file:
        joblib.dump(X_train, "X_train.pickle")
        joblib.dump(X_validation, "X_validation.pickle")
        joblib.dump(y_train, "y_train.pickle")
        joblib.dump(y_validation, "y_validation.pickle")

    if return_file:
        return X_train, X_validation, y_train, y_validation
    return None

In [11]:
def rename_columns(df):
    """
    Shorten the verbose dataset column headers to compact names.

    Parameters
    -----------
    df  : pandas dataframe  - dataframe whose columns should be renamed

    Return
    -------
    pandas dataframe  - result of ``df.rename`` with the mapping below;
                        columns not listed in the mapping keep their names
    """
    short_names = {
        "Weekly Cases": "WeekCase",
        "Weekly Cases per Million": "WeekCasePerMil",
        "Weekly Deaths": "WeekDeath",
        "Weekly Deaths per Million": "WeekDeathPerMil",
        "Total Vaccinations": "TotalVac",
        "People Vaccinated": "PeopleVac",
        "People Fully Vaccinated": "PeopleFullVac",
        "Total Boosters": "TotalBoost",
        "Daily Vaccinations": "DailyVac",
        "Total Vaccinations per Hundred": "TotalVacPerHun",
        "People Vaccinated per Hundred": "PeopleVacPerHun",
        "People Fully Vaccinated per Hundred": "PeopleFullVacPerHun",
        "Total Boosters per Hundred": "TotalBoostPerHun",
        "Daily Vaccinations per Hundred": "DailyVacPerHun",
        "Daily People Vaccinated": "DailyPeopleVac",
        "Daily People Vaccinated per Hundred": "DailyPeopleVacPerHun",
        "Next Week's Deaths": "NWD",
    }
    return df.rename(columns=short_names)

In [5]:
# Raw CSV locations of the train and test splits (GitHub raw URLs).
train_data_path, test_data_path = (
    'https://raw.githubusercontent.com/fcitra/PMDS_Grup_G/main/train.csv',
    'https://raw.githubusercontent.com/fcitra/PMDS_Grup_G/main/test.csv',
)

In [12]:
# Load both splits from their CSV locations.
# NOTE(review): read_data is called with its default save_file=True, so each
# call also dumps its dataframe to "data.pkl"; the test dump therefore
# overwrites the train dump.
train_data = read_data(train_data_path)
test_data = read_data(test_data_path)

In [13]:
# Apply the same column-name shortening to both splits so they share
# identical headers.
train_data = rename_columns(train_data)
test_data = rename_columns(test_data)