# LUMEN DataSci 2021

## Data loading

### Imports

In [None]:
# #%install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
# import autotime
# %load_ext autotime
#

In [None]:
import pickle
import os
import time
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
import pickle
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from kneed import KneeLocator
import scipy

### Dataset

In [None]:
# Path of the pickled dataset (a relative path, so it is resolved against the
# current working directory).
data_path = "LUMEN0.pkl"
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open(data_path, 'rb') as f:
    dataset = pickle.load(f)

## Data cleaning

In [None]:
# Print the label of every column currently present in the dataset.
for feature_name in dataset.columns:
    print(feature_name)

In [None]:
# Columns removed from the dataset in place; nothing below this cell reads them.
unused_columns = [
    'CustomerID',
    'Item Code',
    'Invoice #',
    'Order #',
    'Invoiced price (TX)',
]
dataset.drop(columns=unused_columns, inplace=True)

In [None]:
# Keep only rows with a strictly positive invoiced price. A NaN price compares
# False against `<= 0`, so such rows are left in place by this step.
non_positive_price = dataset['Invoiced price'] <= 0
dataset.drop(index=dataset.loc[non_positive_price].index, inplace=True)

In [None]:
# Share of NaN's per feature
# (isna() yields a boolean frame, so the column-wise mean is the fraction of
# missing entries in each column).
na_share = dataset.isna().mean()
na_share

`'Price last modified date in the ERP'` has a lot of missing values **(73.8%)**, because many prices were never changed. Therefore, we fill all the `NaN`s with the oldest recorded change date.

In [None]:
# Replace every missing price-change date with the earliest date that is
# actually present in the column.
column = 'Price last modified date in the ERP'
last_modified = dataset[column]
missing = last_modified.isna()
oldest = last_modified.dropna().min()
dataset.loc[missing, column] = oldest
# Sanity check: the column must not contain any NaN after the fill.
assert not dataset[column].isna().any()

Drop all the features (columns) that have more than 9% `NaN`s

**TODO: impute them using regressor**

In [None]:
# Remove every column whose NaN share exceeds 9%.
# NOTE(review): when the cells run top to bottom, `na_share` still holds the
# shares computed before 'Price last modified date in the ERP' was filled, so
# that column is still flagged as mostly missing and is dropped here despite
# the imputation. Confirm whether this is intended; later cells may rely on
# the column being absent.
dataset.drop(na_share[na_share > 0.09].index, axis=1, inplace=True)
# Recompute the NaN share over the remaining columns and display it.
na_share = dataset.isna().mean()
na_share

## Drop all rows with NaN's
**TODO: impute them using a regressor**

In [None]:
# Remove, in place, every row that still contains at least one NaN, then show
# the resulting (rows, columns) shape.
dataset.dropna(inplace=True)
dataset.shape

In [None]:
#column = 'Manufacturing Location Code'
#vc = dataset[column].value_counts()
#vc

In [None]:
#for region in ['North America', 'Asia', 'Europe']:
#    x = dataset[column].loc[dataset['Manufacturing Region'] == region].unique()
#    s = set(x)
#    print(len(s), s)

In [None]:
#import plotly.express as px
#column = 'Manufacturing Location Code'
#df = dataset[[column, 'GM%']]#.loc[dataset['Manufacturing Region'] == 'Asia']


#fig = px.box(df, x=column, y='GM%')
#fig.show()
#df

In [None]:
from datetime import datetime
import calendar

def transform_order_date(order_date):
    """Encode order dates as cyclical calendar features.

    Every date is mapped to three angles in [0, 2*pi): its position within
    the week, within its own month, and within its own year (leap years are
    taken into account). The cosine and sine of each angle are returned so
    that the end of a period sits next to its beginning.

    Args:
        order_date: Series-like object exposing ``.apply`` whose elements are
            date strings accepted by ``datetime.fromisoformat``.

    Returns:
        A 6-tuple of NumPy arrays, one value per input date, ordered as
        ``(cos_week, sin_week, cos_month, sin_month, cos_year, sin_year)``.
    """
    parsed = order_date.apply(datetime.fromisoformat)
    full_turn = 2*np.pi

    # weekday() is already zero-based (Monday == 0).
    week_angle = np.array([full_turn*d.weekday() / 7 for d in parsed])

    # Day of month is one-based, so shift it to start at zero; the divisor is
    # the real length of that particular month.
    month_angle = np.array([
        full_turn*(d.day - 1) / calendar.monthrange(d.year, d.month)[1]
        for d in parsed
    ])

    # tm_yday is one-based as well; the divisor is 366 in leap years.
    year_angle = np.array([
        full_turn*(d.timetuple().tm_yday - 1)
        / (366 if calendar.isleap(d.year) else 365)
        for d in parsed
    ])

    return (
        np.cos(week_angle), np.sin(week_angle),
        np.cos(month_angle), np.sin(month_angle),
        np.cos(year_angle), np.sin(year_angle),
    )

In [None]:
#date = dataset['Order Date']
#print(date)
#print(date)
#dow_x, dow_y, dom_x, dom_y, doy_x, doy_y = transform_order_date(date)
#dataset['dow_x'] = dow_x
#dataset['dow_y'] = dow_y
#dataset['dom_x'] = dom_x
#dataset['dom_y'] = dom_y
#dataset['doy_x'] = doy_x
#dataset['doy_y'] = doy_y
#dataset.drop(['Order Date'], axis=1, inplace=True)
#dataset.shape

In [None]:
#import plotly.express as px
#column = 'Manufacturing Region'
#df = negative_gm[column]#, 'GM%']]#.loc[dataset['Manufacturing Region'] == 'Asia']


#fig = px.histogram(df, x=column)
#fig.show()


In [None]:
#import plotly.express as px
#column = 'Manufacturing Region'
#df = negative_gm[column]#, 'GM%']]#.loc[dataset['Manufacturing Region'] == 'Asia']


#fig = px.histogram(dataset, x='GM%', log_y=True)
#fig.show()

In [None]:
# List of all the features
# for feature_name in dataset.keys():
#     print(feature_name, dataset[feature_name].dtype)
# df = pd.get_dummies(dataset, 
#                     columns=['Manufacturing Region', 'Intercompany', 'Customer industry', 'Product family'])

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is
# made), so in-place operations on `df` also modify `dataset`.
df=dataset
df.columns

In [None]:
# Columns removed in place from `df`. Because `df` was bound with
# `df=dataset`, the same columns disappear from `dataset` too.
a = [
    'Manufacturing Region', 'Intercompany',
    'Customer industry', 'Customer Region',
    'Top Customer Group', 'Product family',
    'Make vs Buy',
]
df.drop(columns=a, inplace=True)

In [None]:
# remove extra features for clustering
#dataset.drop(
#    ['Manufacturing Location Code',
#     'Intercompany',
#     'Invoice #',
#     'Order #',
#     'Invoiced price (TX)'],
#    axis=1, inplace=True)

# Columns left out of the clustering input.
non_cluster_columns = [
    'Manufacturing Location Code',
    'Born on date',
    'Customer First Invoice Date',
    'Invoice Date',
    'Invoice Line #',
    'Order Date',
    'Order Line #',
    'Invoiced qty (shipped)',
]
# Without inplace=True, drop() returns a new frame and leaves `dataset` as is.
cluster_df = dataset.drop(columns=non_cluster_columns)

In [None]:
# cluster_df = pd.get_dummies(cluster_df)
# cluster_df

In [None]:
# Discard rows whose gross margin lies outside [-1, 1]. The explicit .copy()
# gives cluster_df its own data, so the column assignments below never write
# into a slice of another frame (no SettingWithCopyWarning).
noto = (cluster_df['GM%'] > 1) | (cluster_df['GM%'] < -1)
cluster_df = cluster_df[~noto].copy()

# The logarithm needs strictly positive inputs. Remove offending rows for all
# three features up front, so that every mean/std below is computed over
# exactly the rows that end up in cluster_df. Dropping them one feature at a
# time would let rows removed later still influence the statistics of the
# features standardised earlier.
log_features = ['Ordered qty', 'Invoiced price', 'Cost of part']
non_positive = (cluster_df[log_features] <= 0).any(axis=1)
cluster_df = cluster_df[~non_positive].copy()

for feature_name in log_features:
    # Log-transform, then standardise to zero mean and unit (sample) std.
    feature = np.log(cluster_df[feature_name])
    feature = (feature - feature.mean()) / feature.std()
    cluster_df[feature_name] = feature
cluster_df

In [None]:
# Fraction of the rows in `dataset` that are no longer present in `cluster_df`.
1-len(cluster_df) / len(dataset)

In [None]:
#import plotly.express as px
#column = 'Invoiced price'

#fig = px.histogram(df, x=column, log_y=True)
#fig.show()

#cluster_df=cluster_df[0:1000]

In [None]:
# from sklearn.cluster import OPTICS

# cluster = OPTICS(n_jobs=-1,max_eps=5)
# cluster.fit(cluster_df)

In [None]:
%%time



# Fit k-means with 12 clusters on the prepared clustering frame.
# NOTE(review): no random_state is passed, so the centroid initialisation, and
# potentially the resulting labels, can differ from run to run.
kmeans = KMeans(n_clusters=12)
kmeans.fit(cluster_df)


In [None]:
# Cluster labels stored on the fitted model, one per row it was fitted on.
kmeans.labels_

In [None]:
# Elbow method: fit k-means for k = 1..10 and record each model's inertia
# (the within-cluster sum of squared distances).
# Note that `kmeans` is rebound on every iteration, so after this cell it
# refers to the model fitted with the last k in `r`.
r = range(1, 11)
sse = []
for k in r:
    kmeans = KMeans(n_clusters=k).fit(cluster_df)
    sse.append(kmeans.inertia_)

# Plot SSE against the number of clusters.
plt.style.use("fivethirtyeight")
plt.plot(r, sse)
plt.xticks(r)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()


In [None]:
# Locate the elbow of the decreasing, convex SSE curve. Reuse `r` for the x
# values so they always match the cluster counts `sse` was computed for.
kl = KneeLocator(r, sse, curve="convex", direction="decreasing")


In [None]:
# x value (number of clusters) at which KneeLocator placed the elbow.
kl.elbow