In [1]:
import import_ipynb ## pip install this to load jupyter notebooks as source files
import pandas as pd
import numpy as np
import sqlite3
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import decomposition
from sklearn.compose import ColumnTransformer


In [2]:
### Load and combine datasets using a merge function#
def load_and_combine(path1,path2,path3):
    """Load the three raw sources and join them into a single DataFrame.

    Parameters
    ----------
    path1 : str or path-like
        SQLite database file; every row of its ``devices`` table is read.
    path2 : str or path-like
        CSV file, read with ``pd.read_csv``.
    path3 : str or path-like
        Parquet file, read with the ``pyarrow`` engine.

    Returns
    -------
    pandas.DataFrame
        CSV rows merged with the ``devices`` rows and then with the parquet
        rows, both times on the shared ``uid_s`` column. ``merge`` is called
        with its default ``how`` (inner join), so only ``uid_s`` values
        present in all three sources survive.
    """
    con = sqlite3.connect(path1)
    try:
        file1 = pd.read_sql_query('SELECT * FROM devices', con)
    finally:
        # Release the database handle even if the query fails; the original
        # code left the connection open for the lifetime of the kernel.
        con.close()
    file2 = pd.read_csv(path2)
    file3 = pd.read_parquet(path3, engine='pyarrow')
    # All three sources share the 'uid_s' key, so it is used for both joins.
    frame = file2.merge(file1, on='uid_s')
    frame = frame.merge(file3, on='uid_s')
    return frame

In [3]:
## filter data, we are interested in those who completed the tutorial###
#Use this to show where the Na's are. 
def filter_data(Frame):
    """Keep only rows where the tutorial was completed and count missing values.

    Parameters
    ----------
    Frame : pandas.DataFrame
        Must contain a ``game_stats_tutorial_complete`` column; rows where it
        equals 1 are kept. The input frame is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The filtered frame, and the number of NaN values per column of that
        filtered frame.
    """
    # .copy() makes the result an independent frame, so later in-place edits
    # on it do not trigger SettingWithCopyWarning.
    completed = Frame[Frame['game_stats_tutorial_complete'] == 1].copy()
    # Count the NaNs per column (the unused per-column boolean flag was dropped).
    na_counts = completed.isna().sum()
    return completed, na_counts


In [4]:
def split_column_types(frame, extra_cluster_columns=('total_spend',)):
    """Group the frame's columns by dtype and pick out the ones used downstream.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns are grouped by their dtype.
    extra_cluster_columns : iterable of str, optional
        Column names added to the clustering list regardless of dtype.
        Defaults to ``('total_spend',)``, matching the previously hard-coded
        behaviour.

    Returns
    -------
    tuple of (dict, list, list)
        ``types`` maps each dtype name to the columns of that dtype,
        ``cat_types`` lists the ``object`` columns, and ``cluster_types``
        lists the ``float64`` columns followed by any extra columns.
    """
    ## groups different column types
    column_types = frame.columns.to_series().groupby(frame.dtypes).groups
    types = {dtype.name: cols for dtype, cols in column_types.items()}
    ## Selects category and float type; a missing dtype group yields an empty
    ## list instead of raising KeyError.
    cat_types = list(types.get('object', []))
    cluster_types = list(types.get('float64', []))
    for col in extra_cluster_columns:
        # Skip columns already selected as float64 so no name appears twice.
        if col not in cluster_types:
            cluster_types.append(col)
    return types, cat_types, cluster_types



In [5]:
## Displays unique category values to see effect of OneHotEncoding
def show_unique_values(Frame,cat_types):
    """Print the distinct values of each listed column, one column per line.

    Parameters
    ----------
    Frame : pandas.DataFrame
        Frame holding the columns to inspect.
    cat_types : iterable of str
        Column names whose unique values are printed, in the given order.
    """
    for column_name in cat_types:
        distinct_values = Frame[column_name].unique()
        print(distinct_values)
        
    

In [6]:
## Counts the distinct values for each categorical column.
### Some of these categories have too many distinct values, which will be an issue when one-hot encoding.
##### We could also use nltk and regular expressions to reduce the variability by grouping similar values; consider this if time permits.
def count_unique(Frame,cat_types):
    """Return the number of distinct values in each of the given columns.

    Parameters
    ----------
    Frame : pandas.DataFrame
        Frame holding the columns to inspect.
    cat_types : list of str
        Column names to count distinct values for.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is that column's distinct-value
        count as computed by ``DataFrame.nunique``.
    """
    # The original computed this Series but never returned it, so callers
    # always got None.
    return Frame[cat_types].nunique()

In [7]:
#### columns are selected to be dropped based on count_unique function##
def drop_columns(Frame,columns_drop):
    """Return a new frame without the given columns.

    Parameters
    ----------
    Frame : pandas.DataFrame
        Source frame; it is not modified.
    columns_drop : str or list of str
        Column label(s) to remove.

    Returns
    -------
    pandas.DataFrame
        ``Frame`` with ``columns_drop`` removed.
    """
    return Frame.drop(columns=columns_drop)

In [8]:
#### Data is standardized and na's are removed
def restructure_data(Frame,cluster_types):
    """Drop incomplete rows, standardize the clustering columns, index by ``uid_s``.

    Parameters
    ----------
    Frame : pandas.DataFrame
        Must contain a ``uid_s`` column and every column in ``cluster_types``.
        The input frame is left untouched.
    cluster_types : list of str
        Columns passed through ``StandardScaler.fit_transform``.

    Returns
    -------
    pandas.DataFrame
        A new frame with every row containing a NaN removed, the
        ``cluster_types`` columns replaced by their scaled values, and
        ``uid_s`` moved from a column to the index.
    """
    # Work on an independent copy: the original used dropna(inplace=True) and
    # overwrote columns on the caller's frame, so re-running the cell operated
    # on already-modified data.
    cleaned = Frame.dropna().copy()
    scaler = StandardScaler()
    cleaned[cluster_types] = scaler.fit_transform(cleaned[cluster_types])
    return cleaned.set_index('uid_s')




In [9]:
### one hot encoding takes place here, old categorical columns are removed and original dataframe is joined
def encode_and_concat(dataframe, feature):
    """One-hot encode ``feature`` and swap it for its indicator columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame; it is not modified.
    feature : str or list of str
        Column label(s) passed to ``pd.get_dummies`` and then dropped.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the indicator columns, with
        ``feature`` removed.
    """
    indicator_columns = pd.get_dummies(dataframe[feature])
    # Append first and drop afterwards, same order as before.
    widened = pd.concat([dataframe, indicator_columns], axis='columns')
    return widened.drop(feature, axis='columns')


