# Conda environment: kenva

# Boilerplate

In [13]:
import pandas as pd 
import pandasql as ps
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
import scipy.stats as stats
import statsmodels.stats.api as sms

from IPython.display import display
from IPython.display import Markdown as md
from IPython.core.interactiveshell import InteractiveShell
# Show the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

def f():
    """Switch pandas to "full" display: no limit on shown columns or rows.

    Changes the global pandas display options for the rest of the session.
    """
    for option_name in ('display.max_columns', 'display.max_rows'):
        # None removes the corresponding display limit.
        pd.set_option(option_name, None)
    
def nf():
    """Switch pandas to "compact" display: all columns, at most 4 rows.

    Changes the global pandas display options for the rest of the session.
    """
    compact_options = {
        'display.max_columns': None,  # no limit on columns
        'display.max_rows': 4,
    }
    for option_name, option_value in compact_options.items():
        pd.set_option(option_name, option_value)

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by libraries;
# consider narrowing the filter to specific categories or modules.
import warnings
warnings.filterwarnings('ignore')

def dp(df, r = 5, c = None):
    """Display `df` with temporary pandas row/column display limits.

    The limits are applied through `pd.option_context`, so they are only in
    effect while `df` is rendered and the global display options are restored
    when the call returns.

    Args:
        df: Object to render with IPython's `display` (typically a DataFrame).
        r: Value used for `display.max_rows` during the call.
        c: Value used for `display.max_columns` during the call; `None`
            removes the column limit.
    """
    # Local import keeps the helper usable even if the notebook-level import
    # cell has not been executed in the current kernel.
    from IPython.display import display
    # Honour the caller's limits (previously `r` and `c` were ignored and the
    # values 4 / None were hard-coded).
    with pd.option_context('display.max_rows', r, 'display.max_columns', c):
        display(df)

def fg(w = 10, h = 7, dpi = 200):
    """Set the default matplotlib figure size and resolution.

    Args:
        w: Default figure width, stored as the first element of
            `figure.figsize`.
        h: Default figure height, stored as the second element of
            `figure.figsize`.
        dpi: Default figure resolution, stored as `figure.dpi`.
    """
    figure_defaults = {
        'figure.figsize': (w, h),
        'figure.dpi': dpi,
    }
    for rc_key, rc_value in figure_defaults.items():
        plt.rcParams[rc_key] = rc_value
fg()

%reload_kedro

2022-02-13 13:01:55,032 - root - INFO - ** Kedro project spaceflights
2022-02-13 13:01:55,034 - root - INFO - Defined global variable `context` and `catalog`
2022-02-13 13:01:55,046 - root - INFO - Registered line magic `run_viz`


# path

In [14]:
# Load the raw `companies` dataset through the Kedro data catalog
# (`catalog` is defined by `%reload_kedro`).
companies = catalog.load('companies')

2022-02-13 13:01:56,492 - kedro.io.data_catalog - INFO - Loading data from `companies` (CSVDataSet)...


In [15]:
companies

Unnamed: 0,id,company_rating,company_location,total_fleet_count,iata_approved
0,35029,100%,Niue,4.0,f
1,30292,67%,Anguilla,6.0,f
2,19032,67%,Russian Federation,4.0,f
3,8238,91%,Barbados,15.0,t
4,30342,,Sao Tome and Principe,2.0,t
...,...,...,...,...,...
77091,6654,100%,Tonga,3.0,f
77092,8000,,Chile,2.0,t
77093,14296,,Netherlands,4.0,f
77094,27363,80%,,3.0,t


In [6]:
# Load the raw `reviews` dataset through the Kedro data catalog.
reviews = catalog.load('reviews')

2022-02-13 11:05:58,414 - kedro.io.data_catalog - INFO - Loading data from `reviews` (CSVDataSet)...


In [7]:
reviews

Unnamed: 0,shuttle_id,review_scores_rating,review_scores_comfort,review_scores_amenities,review_scores_trip,review_scores_crew,review_scores_location,review_scores_price,number_of_reviews,reviews_per_month
0,63561,97.0,10.0,9.0,10.0,10.0,9.0,10.0,133,1.65
1,36260,90.0,8.0,9.0,10.0,9.0,9.0,9.0,3,0.09
2,57015,95.0,9.0,10.0,9.0,10.0,9.0,9.0,14,0.14
3,14035,93.0,10.0,9.0,9.0,9.0,10.0,9.0,39,0.42
4,10036,98.0,10.0,10.0,10.0,10.0,9.0,9.0,92,0.94
...,...,...,...,...,...,...,...,...,...,...
77091,4368,,,,,,,,0,
77092,2983,,,,,,,,0,
77093,69684,,,,,,,,0,
77094,21738,,,,,,,,0,


## shuttles

In [8]:
ls ../data/01_raw

[0m[01;32mcompanies.csv[0m*  [01;32mreviews.csv[0m*  [01;32mshuttles.xlsx[0m*


In [9]:
! cat ../conf/base/catalog.yml

# Here you can define all your data sets by using simple YAML syntax.
#
# Documentation for this file format can be found in "The Data Catalog"
# Link: https://kedro.readthedocs.io/en/stable/04_user_guide/04_data_catalog.html

# raw

companies:
  type: pandas.CSVDataSet
  filepath: data/01_raw/companies.csv
  layer: raw

reviews:
  type: pandas.CSVDataSet
  filepath: data/01_raw/reviews.csv
  layer: raw

shuttles:
  type: pandas.ExcelDataSet
  filepath: data/01_raw/shuttles.xlsx
  layer: raw

# intermediate

preprocessed_companies:
  type: pandas.ParquetDataSet
  filepath: data/02_intermediate/preprocessed_companies.parquet
  layer: intermediate

preprocessed_shuttles:
  type: pandas.ParquetDataSet
  filepath: data/02_intermediate/preprocessed_shuttles.parquet
  layer: intermediate

# Dataset for regressor
regressor:
  type: pickle.PickleDataSet
  filepath: data/06_models/regressor.pickle
  versioned: true
  layer: model_output


In [10]:
# Load the raw `shuttles` dataset through the Kedro data catalog
# (registered as an Excel dataset in conf/base/catalog.yml).
shuttles = catalog.load('shuttles')

2022-02-13 11:06:05,220 - kedro.io.data_catalog - INFO - Loading data from `shuttles` (ExcelDataSet)...


In [11]:
shuttles.sample(5)

Unnamed: 0,id,shuttle_location,shuttle_type,engine_type,engine_vendor,engines,passenger_capacity,cancellation_policy,crew,d_check_complete,moon_clearance_complete,price,company_id
45895,58311,Russian Federation,Type V5,Quantum,ThetaBase Services,1.0,2,moderate,1.0,t,f,"$1,429.0",14876
59945,73563,Malta,Type V5,Plasma,ThetaBase Services,3.0,6,strict,3.0,t,f,"$3,327.0",20334
29397,19535,Gambia,Type V5,Quantum,ThetaBase Services,1.0,2,flexible,1.0,t,f,"$1,442.0",24898
26763,9566,Malta,Type V5,Plasma,ThetaBase Services,1.0,2,strict,1.0,f,f,"$1,533.0",17522
63516,5405,Niue,Type V5,Plasma,ThetaBase Services,1.0,1,flexible,1.0,f,f,"$1,312.0",21559


## Data processing node

In [12]:
import pandas as pd


# True where the value equals the string "t"
def _is_true(x):
    return x == "t"

# Strip the % sign and convert the percentage to a fraction (e.g. "67%" -> 0.67)
def _parse_percentage(x):
    x = x.str.replace("%", "")
    x = x.astype(float) / 100
    return x

# Strip the $ sign and the thousands separators, then convert to float
def _parse_money(x):
    x = x.str.replace("$", "").str.replace(",", "")
    x = x.astype(float)
    return x

# Effective nodes 

# Convert `iata_approved` from "t"/"f" strings to booleans
# Convert `company_rating` from a percentage string to a float
def preprocess_companies(companies: pd.DataFrame) -> pd.DataFrame:
    """Preprocesses the data for companies.

    The input frame is left untouched: a new DataFrame is returned, so the
    function can be re-run on the same raw data (e.g. when a notebook cell is
    executed twice) without failing on already-converted columns.

    Args:
        companies: Raw data.
    Returns:
        Preprocessed data, with `company_rating` converted to a float and
        `iata_approved` converted to boolean.
    """
    # `assign` builds a new frame and replaces the two existing columns in
    # place, so column order is preserved and `companies` is not mutated.
    return companies.assign(
        iata_approved=_is_true(companies["iata_approved"]),
        company_rating=_parse_percentage(companies["company_rating"]),
    )

# Convert `d_check_complete` from "t"/"f" strings to booleans
# Convert `moon_clearance_complete` from "t"/"f" strings to booleans
# Convert `price` from a currency string to a float
def preprocess_shuttles(shuttles: pd.DataFrame) -> pd.DataFrame:
    """Preprocesses the data for shuttles.

    The input frame is left untouched: a new DataFrame is returned, so the
    function can be re-run on the same raw data (e.g. when a notebook cell is
    executed twice) without failing on already-converted columns.

    Args:
        shuttles: Raw data.
    Returns:
        Preprocessed data, with `price` converted to a float and `d_check_complete`,
        `moon_clearance_complete` converted to boolean.
    """
    # `assign` builds a new frame and replaces the three existing columns in
    # place, so column order is preserved and `shuttles` is not mutated.
    return shuttles.assign(
        d_check_complete=_is_true(shuttles["d_check_complete"]),
        moon_clearance_complete=_is_true(shuttles["moon_clearance_complete"]),
        price=_parse_money(shuttles["price"]),
    )