# Data Science Framework
Author: Luc Mercier

# Configuration

## Install

## Import

In [0]:
# Import libraries
# MAIN
import numpy as np
import pandas as pd
import requests
import re
import math

# SECONDARY
# from bs4 import BeautifulSoup
#import time
#import nltk
#import matplotlib as mpl
#import datetime as dt
#import statsmodels.discrete.discrete_model as sm
#import graphviz

#from pandas.io import gbq 
#from IPython.utils import io
#from sklearn.feature_selection import RFE
#from sklearn.linear_model import LogisticRegression

#from nltk.corpus import stopwords
#from scipy.stats import pearsonr
#from IPython.display import display, HTML, Image

# Configuration
#pd.options.display.float_format = '{:,.1f}'.format
#warnings.filterwarnings('ignore')
%matplotlib inline

# Turn off the pandas chained-assignment check (SettingWithCopyWarning).
# Background: http://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
pd.options.mode.chained_assignment = None  # default='warn'

## Configure

In [0]:
# Seaborn settings
# NOTE(review): `sns` is not bound by any import visible in this notebook;
# presumably `import seaborn as sns` must run before this cell -- confirm.
# Use the "talk" plotting context with fonts scaled to 0.8.
sns.set_context("talk", font_scale=0.8)
# White background with grid lines.
sns.set_style("whitegrid")
# Optional palette, currently disabled:
#tm_palette = 'YlOrRd_r'
#sns.color_palette(tm_palette)

## Hide Code for presentation

In [0]:
from IPython.display import HTML

# Emit an HTML snippet as the cell output. The embedded script defines
# `code_toggle()`, which hides or shows every `div.input` element and flips
# the `code_show` flag, and registers it to run once when the document is
# ready. The trailing link (French: "To see the code, click here") calls
# `code_toggle()` again on demand.
# NOTE(review): the script relies on `$` (presumably jQuery) being available
# in the page that renders the notebook -- confirm for the target front end.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
Pour voir le code, cliquer <a href="javascript:code_toggle()">ici</a>.''')

In [0]:
def fill_missing_dates(df, dateMin=None, dateMax=None):
    """
    Fill missing days in a DataFrame indexed by a pandas DatetimeIndex.

    df = the DataFrame to complete.
    dateMin (optional) = first date of the range to fill. If omitted, the
        smallest date of the index is used.
    dateMax (optional) = last date of the range to fill. If omitted, the
        largest date of the index is used.

    Returns a DataFrame indexed by every day between dateMin and dateMax
    (inclusive); rows for days that were absent are filled with 0, as is any
    other missing value in the frame. If the index is not a DatetimeIndex, or
    if no bound can be determined (empty frame and no explicit dates), the
    original DataFrame is returned untouched.
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        return df

    if dateMin is None:
        dateMin = df.index.min()
    if dateMax is None:
        dateMax = df.index.max()

    # An empty index yields NaT for min/max: there is no range to build.
    if pd.isnull(dateMin) or pd.isnull(dateMax):
        return df

    all_days = pd.date_range(dateMin, dateMax, freq='D')
    # `reindex` inserts the absent days as NaN rows; `df.loc[all_days]` raises
    # KeyError on missing labels in current pandas versions.
    return df.reindex(all_days).fillna(0)

In [0]:
def nullify_text(text):
    """
    Harmonize "no value" markers coming from the source data.

    Input                 Output
    NaN / None (null)     None
    "Aucun"               None
    "0"                   None
    anything else         returned unchanged
    """
    # Null-like values (NaN, None, NaT) carry no information.
    if pd.isnull(text):
        return None
    # Textual placeholders used by the source for "nothing".
    if text == "Aucun" or text == "0":
        return None
    return text

In [0]:
def make_bool(thing):
    """
    Return True when `thing` is strictly greater than 0, False otherwise.
    """
    # TODO: also treat a non-empty string as True and an empty one as False.
    return bool(thing > 0)

In [0]:
#def time_to_seconds(zeTime):
#    x = time.strptime(zeTime.split(',')[0],'%H:%M:%S')
#    return datetime.timedelta(hours=x.tm_hour,minutes=x.tm_min,seconds=x.tm_sec).total_seconds()

def time_to_seconds(zeTime):
    """
    Convert a time-of-day into the number of seconds since midnight.

    zeTime = any object exposing `hour`, `minute` and `second` attributes.
    Returns the total as a float; sub-second precision is not read.
    """
    # Imported locally so the function does not depend on a module-level
    # `datetime` name being bound.
    from datetime import timedelta

    elapsed = timedelta(hours=zeTime.hour, minutes=zeTime.minute, seconds=zeTime.second)
    return elapsed.total_seconds()

In [0]:
def get_season(date):
    """
    Return the season name for a date.

    date = any object exposing `month` and `day` attributes (datetime,
        date, pandas Timestamp, ...).

    Boundaries (inclusive), identical every year:
        spring  21 March     - 20 June
        summer  21 June      - 22 September
        autumn  23 September - 20 December
        winter  everything else

    Returns one of 'spring', 'summer', 'autumn', 'winter'.
    """
    # Compare (month, day) tuples instead of testing membership in a daily
    # date range: the result no longer depends on the time-of-day part of
    # `date`, and no date range has to be built on every call.
    month_day = (date.month, date.day)

    if (3, 21) <= month_day <= (6, 20):
        return 'spring'
    if (6, 21) <= month_day <= (9, 22):
        return 'summer'
    if (9, 23) <= month_day <= (12, 20):
        return 'autumn'
    return 'winter'
      
def add_season_simple(df):
    """
    Add a month-based 'season' column to `df`.

    The 'month' column is cast to int in place, then mapped as follows:
    Dec-Feb -> winter, Mar-May -> spring, Jun-Aug -> summer,
    Sep-Nov -> autumn. The frame is modified in place and also returned.
    """
    season_names = {1: "winter", 2: "spring", 3: "summer", 4: "autumn"}

    df['month'] = df['month'].astype("int")
    # (month % 12 + 3) // 3 gives 1 for Dec-Feb, 2 for Mar-May, and so on.
    season_number = (df['month'] % 12 + 3) // 3
    df["season"] = season_number.map(season_names)

    return df

In [0]:
# get_dummies

In [0]:
def in_ipynb():
    """
    Tell whether the code is running inside an IPython notebook.

    Returns True when the IPython configuration reports
    'ipython-notebook' as the kernel's parent application name, False
    otherwise. When `get_ipython` is not defined at all (plain Python
    interpreter), False is returned.
    """
    try:
        kernel_app = get_ipython().config['IPKernelApp']
        return bool(kernel_app['parent_appname'] == 'ipython-notebook')
    except NameError:
        # `get_ipython` only exists inside an IPython session.
        return False

# Import Data

## Setup

In [0]:
# Google Cloud Storage stuff
#bucket_path = "dataproc-fc4b0345-3a3c-41e2-9c9b-5ed323ebbe46-us"
#client = storage.Client()
#bucket = client.get_bucket(bucket_path)

In [0]:
# Google Cloud Storage from VM

#from google.cloud import storage
#from io import BytesIO
#client = storage.Client()
#bucket = "toxicity_input"

#blob = storage.blob.Blob("train.csv",bucket)
#content = blob.download_as_string()
#train = pd.read_csv(BytesIO(content))

# Cleanup

## Taking care of missing data

In [0]:
# Configure the imputer
# `sklearn.preprocessing.Imputer` no longer exists in current scikit-learn
# releases; `sklearn.impute.SimpleImputer` is its replacement. It always
# imputes column by column, which is what `axis=0` requested before.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# Use the imputer: learn the column means, then fill the missing entries
#imputer = imputer.fit(X[:, 1:3])
#X[:,1:3] = imputer.transform(X[:,1:3])



In [0]:
#

## Encode Categorical Data

In [0]:
# Encode labels: turn category names into integer codes.

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encoder instance intended for the categorical feature column(s) of X.
labelencoder_X = LabelEncoder()

# Example usage: replace column 0 of X with its integer codes.
#X[:,0] = labelencoder_X.fit_transform(X[:,0])

# Make sure the machine does not see rank in unranked features (ex. country)
# NOTE(review): the `categorical_features` argument is presumably unavailable
# in recent scikit-learn releases -- confirm before re-enabling this line.
#onehotencoder = OneHotEncoder(categorical_features=[0])

# Create dummy variables
#X = onehotencoder.fit_transform(X).toarray()

# Exploratory Data Analysis

## NULL values

## Basic correlations

### Boolean x, float target

# Analysis

## Feature Scaling

* Make sure that we do not introduce an artificial bias because features are not on the same scale.
* Added bonus: it makes processing faster.



In [0]:
from sklearn.preprocessing import StandardScaler

# Scaler instance for the feature matrix.
sc_X = StandardScaler()
# Fit on the training set only, then apply the same fitted scaler to the
# test set.
#X_train = sc_X.fit_transform(X_train)
#X_test = sc_X.transform(X_test)

# What about dummy variables?
# It depends on the context. It depends on the interpretation we need.

## Split into test and train

In [0]:
from sklearn.model_selection import train_test_split

#X = 
#y =

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Models and predictive

# ANN

Keras or PyTorch?

TL;DR:
Keras may be easier to get into and experiment with standard layers, in a plug & play spirit.

PyTorch offers a lower-level approach and more flexibility for the more mathematically-inclined users.

## Train ANN

(Stochastic Gradient Descent)

1. Randomly initialize weights to small numbers close to 0 (but not 0)

2. Input first observation in input layer, each feature in one input node

3. Forward propagation. Neurons are activated according to their weights.

4. Compare predicted and actual. Measure error.

5. Back propagation. Propagate error, update weights according to how much they are responsible for the error

6. Repeat steps 2-5 and either:

> A) update the weights after each observation (online / stochastic learning), or
>
> B) update the weights after each batch of observations (batch learning)

7. When whole training set is done, redo more epochs

In [0]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

In [0]:
# Initializing the ANN: an empty Sequential model; layers are added to it
# one at a time with `classifier.add(...)`.
classifier = Sequential()

# Train ANN
# Stochastic Gradient Descent
# 1. Randomly initialize weights to small numbers close to 0 (but not 0)

# 2. Input first observation in input layer, each feature in one input node

# 3. Forward propagation. Neurons are activated according to their weights.

# 4. Compare predicted and actual. Measure error.

# 5. Back propagation. 
# Propagate error, update weights according to how much they are 
# responsible for the error

# 6. Repeat steps 2-5 and either:
#     1. update weights after each observation (online / stochastic learning)
#     2. update weights after each batch of observations (batch learning)

# 7. When whole training set is done, redo more epochs

In [0]:
# Add input layer and first hidden layer
# Uses the Keras 2+ argument name `kernel_initializer` (formerly `init`).

# Tip for nodes in hidden layer:
# use the average between the number of nodes in the input layer and in the
# output layer

# Otherwise use parameter tuning (ex. k-fold cross-validation)

# Here: input = 11, output = 1 because it's binary
# kernel_initializer = weights initialization (here = uniform)
# activation function = rectifier function for hidden layer ("relu")

classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=11))

# Second hidden layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))

# Output layer
# one output, dependent variable is boolean
# Activation = sigmoid
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

In [0]:
# Compile the model.
# optimizer: the algorithm that updates the weights; here 'adam', a variant
#   of stochastic gradient descent.
# loss: the function the optimizer minimizes. Linear regression would use
#   the sum of squared errors; a binary outcome uses logarithmic loss,
#   i.e. 'binary_crossentropy' (use 'categorical_crossentropy' for more
#   than two classes).
# metrics: list of metrics to be evaluated during training.
classifier.compile(optimizer='adam', loss='binary_crossentropy', 
                   metrics = ['accuracy'])

In [0]:
# Fit ANN to training
# batch_size: number of observations after which we update weights
# epochs: number of times the whole dataset passes through the ANN
#   (this argument was named `nb_epoch` in Keras 1)
# both chosen arbitrarily

classifier.fit(X_train, y_train, batch_size=10, epochs=100)

## Test

K-fold Cross-Validation

Split the training set into k folds. Train on k-1 folds and test on the remaining one; repeat k times so that each fold is used as the test set exactly once.