<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Research-and-Development-(RND)" data-toc-modified-id="Research-and-Development-(RND)-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Research and Development (RND)</a></span><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Resource-Check" data-toc-modified-id="Resource-Check-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Resource Check</a></span></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Basic-Stats" data-toc-modified-id="Basic-Stats-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Basic Stats</a></span></li><li><span><a href="#Correlations" data-toc-modified-id="Correlations-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Correlations</a></span></li><li><span><a href="#Regressions" data-toc-modified-id="Regressions-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Regressions</a></span></li><li><span><a href="#ML-Preprocessing" data-toc-modified-id="ML-Preprocessing-1.7"><span class="toc-item-num">1.7&nbsp;&nbsp;</span>ML Preprocessing</a></span></li></ul></li></ul></div>

# Research and Development (RND)

This notebook attempts to capture the basic steps involved in most initial research and development (RND) activities leading up to the scripting, modularization, and packaging of production-ready code. In the [Domino Data Lab Data Science Lifecycle](https://www.dominodatalab.com/resources/field-guide/managing-data-science-projects/) (a personal favorite of mine), RND aims to generate valuable insights that the business needs to make decisions.


![img](../assets/dsci-lifecycle-rnd.png)

## Imports

In [None]:
import datetime
import glob
import itertools
import json
import os
import pickle
import random
import re
from string import punctuation

In [None]:
import fuzzywuzzy
import geopandas as gpd
import humanize
import missingno as msno
import numpy as np
import pandas as pd
import recordlinkage
import scipy as sp
import statistics
import statsmodels.formula.api as smf
from pandas_profiling import ProfileReport

In [None]:
# font sizes handed to the plt.rc(...) calls in this cell
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 10, 14, 18

# mapbox figure defaults
MAPBOX_STYLE, MAPBOX_HEIGHT = "dark", 800
%matplotlib inline
import seaborn as sns
import matplotlib as mpl
import matplotlib.font_manager
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from matplotlib.ticker import PercentFormatter
# pass the Mapbox token from the environment to plotly express
# (os.getenv returns None when MAPBOX_TOKEN is unset)
px.set_mapbox_access_token(os.getenv("MAPBOX_TOKEN"))

# NOTE(review): the returned list of font paths is discarded — presumably kept
# only to scan the system fonts; confirm it is still needed
matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext="ttf")

# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-<name>" and
# 3.8 removed the old names; prefer the new name and fall back for older versions
COLORBLIND_STYLE = "seaborn-v0_8-colorblind"
if COLORBLIND_STYLE not in plt.style.available:
    COLORBLIND_STYLE = "seaborn-colorblind"
plt.style.use(COLORBLIND_STYLE)

plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = "Open Sans"
plt.rcParams["figure.figsize"] = 15, 6
plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [None]:
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import (GridSearchCV, cross_validate, learning_curve, train_test_split)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, scale

In [None]:
from numpy.random import default_rng

# single seed shared by every stochastic step so results are reproducible
RANDOM_SEED = 51

# seeded generator; the original passed the undefined name `seed` here
rng = default_rng(RANDOM_SEED)

## Load Data

In [None]:
# read the "data" table from the cleaned HDF5 store
df = pd.read_hdf("clean-data.h5", key="data")

## Basic Stats

In [None]:
# name of the dataframe column examined by the cells in this section
data_col = "col_a"

In [None]:
# fit a Gaussian model from the column's mean and standard deviation
col_values = df[data_col]
mean = col_values.mean()
std = col_values.std()
dist = sp.stats.norm(loc=mean, scale=std)

# evaluate the model CDF on an evenly spaced grid spanning the observed range
xs = np.linspace(col_values.min(), col_values.max())
ys = dist.cdf(xs)

In [None]:
# plot the model CDF: data values on x, cumulative probability on y
fig, ax = plt.subplots()
ax.plot(xs, ys, color="gray")

# overlay the empirical CDF on the same axes; x= (not y=) keeps the data values
# on the horizontal axis so the two curves are directly comparable
sns.ecdfplot(data=df, x=data_col, ax=ax)

In [None]:
# PMF: the probability of each distinct value of a discrete variable
# (index by the data_col variable, not a literal column named "data_col")
probabilities = df[data_col].value_counts(normalize=True)

# x/y are passed by keyword; recent seaborn releases reject positional data arguments
sns.barplot(x=probabilities.index, y=probabilities.values)

In [None]:
# evaluate the model Gaussian density at the grid points in xs
ys = dist.pdf(x=xs)

In [None]:
# plot the model PDF
fig, ax = plt.subplots()
ax.plot(xs, ys, color="gray")

# histogram of the observed values, normalized to a density so it shares the
# y-scale of the model PDF; the axes-level histplot draws on `ax`, whereas the
# figure-level displot would open a separate figure
sns.histplot(data=df, x=data_col, stat="density", ax=ax)

# KDE: smoothed estimate of the same density, overlaid on the same axes
sns.kdeplot(data=df, x=data_col, ax=ax)

## Correlations

In [None]:
# CAUTION: correlation only measures linear association, so a low
# coefficient does not rule out a (non-linear) relationship
corr_cols = ["col_a", "col_b", "col_c"]
corr = df.loc[:, corr_cols].corr()

# boolean mask covering the upper triangle, diagonal included
mask = np.triu(np.ones_like(corr, dtype=bool))

# figure and axes that will hold the heatmap
f, ax = plt.subplots()

# diverging colormap centred between the two hues
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# heatmap options collected in one place, then drawn onto the axes created above
heatmap_kwargs = dict(
    mask=mask,
    cmap=cmap,
    vmax=0.3,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)
sns.heatmap(corr, ax=ax, **heatmap_kwargs)

## Regressions

In [None]:
# simple linear regression of col_b on col_a with scipy
xs, ys = df["col_a"], df["col_b"]
fit = sp.stats.linregress(xs, ys)
print(fit)

In [None]:
# use statsmodels linreg
results = smf.ols("target ~ col_a + col_b + col_c + col_d", data=df).fit()
print(results.summary())

# use statsmodels logreg
results = smf.logit("target ~ col_a + col_b + C(col_c)", data=df).fit()
print(results.summary())

# make predictions
# build the base columns first, then derive the squared columns from them;
# the original referenced an undefined `preds` inside the constructor
preds_df = pd.DataFrame(
    dict(
        col_a=np.linspace(0, 20),
        col_b=30,
    )
).assign(
    col_c=lambda frame: frame["col_a"] ** 2,
    col_d=lambda frame: frame["col_b"] ** 2,
)

# `results` is the logit fit at this point (it replaced the OLS fit above)
pred = results.predict(preds_df)

In [None]:
# options shared by both seaborn regression plots
regplot_kwargs = dict(data=df, x="col_a", y="col_b", n_boot=500, y_jitter=0.03)

# linear fit
g = sns.regplot(**regplot_kwargs)

# logistic fit
g = sns.regplot(logistic=True, **regplot_kwargs)

## ML Preprocessing

In [None]:
# name of the column holding the value to predict
target_col = "price"

# columns to drop; the target column is appended to the list
drop_cols = ["col_a", "col_b"] + [target_col]

In [None]:
# create a train test split
# fraction of rows held out for the test set
TEST_SIZE = 0.3

# features are every column not listed in drop_cols; the target is df[target_col].
# test_size uses the TEST_SIZE constant (the original passed an undefined lowercase
# name) and random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=drop_cols),
    df[target_col],
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)