# Exploratory data analysis (EDA)

In [238]:
import os
import pandas as pd
import plotly.express as px
from sklearn import decomposition 
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import numpy as np      

## Data pipeline

The data pipeline, in the conventional sense, follows the ETL process. Here, "Extraction", is perfomed after you find the answers to the following questions. 
1. What data are you using?
2. What are your inputs?
3. Whata are your labels?
4. How long will you spend getting the data?

Then worry about:
1. Reproducibility
2. Meta-data
3. Data lineage
4. Balanced data sets. 

All of which will come in handy when you transform your code into a product. 

For this tutorial experiment we will use a structured data set. The data set is aboout "sales". To begin with, all attributes in the data set are inputs excluding price which is going to be our label or target attribute. To keep it simple the data is stored in a .csv file which is neither updated nor changed in any way. In practice, you would use a script (or whatever works for you) to download, scrape, etc your data at a specificed frequency, depending on your use cases. For example: daily data updates for batch jobs or minute data for data-stream jobs. 

In [347]:
class Data(object):
    def __init__(self):
        self.root_dir = os.path.join(os.getcwd(), os.pardir) #directory where data is located. In our case the data is in the project root directory
        self.data_dir = os.path.join(self.root_dir, 'data', 'raw')
        self.raw = None #stores raw data
        
    def get(self, file_name):
        data = pd.read_csv(os.path.join(self.data_dir, file_name))
        self.raw = data
        return data.info()
        
    def plot(self, feature=None, target=None, color=None):
        assert self.raw is not None, "Use get(file_name) method to import data"
        fig = px.scatter(x=self.raw[feature], y=self.raw[target], color=color)
        fig.show()
        
    #Unsupervised learning algortihms
    
    def pca(self, features, target):
        #feature engineering: reduce dimensionality    
        pca = decomposition.PCA(n_components='mle')
        principal_components = pca.fit_transform(features)
        principal_components = pd.DataFrame(principal_components)
        columns = [f'PC{index}' for index in principal_components]
        principal_components.columns = columns
        return pd.concat([principal_components, target], axis=1)
    
    def kmeans(self, features, num_clusters):
        #Classify using k-means
        k_means = KMeans(num_clusters).fit(features)
        identified_clusters = k_means.predict(features)
        return pd.concat([pd.DataFrame(features), pd.DataFrame(identified_clusters)], axis=1)

    #Supervised learning algorithm
    def make_ds(self, features, split=0.001):
        data = self.kmeans(features, num_clusters=2)
        
        train_labels = data.iloc[:int(len(data)*split),-1]
        train_data = data.iloc[:int(len(data)*split),:-2]
        
        test_labels = data.iloc[int(len(data)*split):,-1]
        test_data = data.iloc[int(len(data)*split):,:-2]
        
        return train_data, train_labels, test_data, test_labels
    
    def logistic_regression(self, features):
        train_data, train_labels, test_data, test_labels = self.make_ds(features)
        clf = LogisticRegression(random_state=0).fit(train_data, train_labels)
        prediction = clf.predict(test_data)
        return confusion_matrix(test_labels.to_numpy(), prediction)
    
    def svm(self, features):
        train_data, train_labels, test_data, test_labels = self.make_ds(features)
        clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
        clf.fit(train_data, train_labels)
        prediction = clf.predict(test_data)
        return confusion_matrix(test_labels.to_numpy(), prediction)
    
    def knn(self, features, n_neighbors=3):
        train_data, train_labels, test_data, test_labels = self.make_ds(features)
        neigh = KNeighborsClassifier(n_neighbors=n_neighbors)
        neigh.fit(train_data, train_labels)
        prediction = neigh.predict(test_data)
        return confusion_matrix(test_labels.to_numpy(), prediction)

In [348]:
data = Data()

In [349]:
data.get('sales_data.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4093 entries, 0 to 4092
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PRODUCT_ID        4093 non-null   int64  
 1   CUSTOMER_ID       4093 non-null   int64  
 2   LOCATION_ID       4093 non-null   int64  
 3   TRANSACTION_DATE  4093 non-null   object 
 4   PRICE_PER_UNIT    4093 non-null   float64
 5   QUANTITY_SOLD     4093 non-null   int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 192.0+ KB


In [350]:
data.raw

Unnamed: 0,PRODUCT_ID,CUSTOMER_ID,LOCATION_ID,TRANSACTION_DATE,PRICE_PER_UNIT,QUANTITY_SOLD
0,17,1,1,12/6/2017,12.42,44
1,22,1,1,7/15/2020,16.00,84
2,43,1,1,9/6/2020,11.50,26
3,17,1,1,5/8/2017,10.47,60
4,91,1,1,8/6/2020,12.66,34
...,...,...,...,...,...,...
4088,43,4,2,10/11/2018,10.12,22
4089,17,4,2,12/19/2018,10.45,74
4090,43,4,2,8/15/2020,11.50,20
4091,17,4,2,11/22/2019,12.77,36


In [351]:
data.plot(feature='QUANTITY_SOLD', target='PRICE_PER_UNIT')

In [352]:
data.plot(feature='TRANSACTION_DATE', target='QUANTITY_SOLD')

In [353]:
data.raw.columns

Index(['PRODUCT_ID', 'CUSTOMER_ID', 'LOCATION_ID', 'TRANSACTION_DATE',
       'PRICE_PER_UNIT', 'QUANTITY_SOLD'],
      dtype='object')

In [354]:
# define features used to predict target(s) variable
features = np.array((data.raw[['PRODUCT_ID', 'CUSTOMER_ID', 'LOCATION_ID','QUANTITY_SOLD']]))
target = data.raw[['QUANTITY_SOLD', 'PRICE_PER_UNIT']]

In [355]:
# create new dataset
min_dims_data = data.pca(features=features, target=target)

In [356]:
min_dims_data

Unnamed: 0,PC0,PC1,PC2,QUANTITY_SOLD,PRICE_PER_UNIT
0,20.591670,-16.594350,-1.333751,44,12.42
1,41.394352,17.934596,-1.348801,84,16.00
2,-10.972193,-14.665168,-1.362137,26,11.50
3,30.484612,-4.019388,-1.337282,60,10.47
4,-43.750588,21.301108,-1.423641,34,12.66
...,...,...,...,...,...
4088,-13.448134,-17.805990,1.774299,22,10.12
4089,39.138231,6.986623,1.795182,74,10.45
4090,-14.684752,-19.377860,1.774740,20,11.50
4091,15.642493,-22.878912,1.803568,36,12.77


In [357]:
fig = px.scatter(min_dims_data.loc[:,'PC0'],min_dims_data.loc[:,'QUANTITY_SOLD'])
fig.show()                                                              

HW. What are PC0,...PCNn?

In [358]:
#features = np.array((data.raw[['CUSTOMER_ID']]))

In [359]:
data_with_classes = data.kmeans(features=features, num_clusters=2)

In [360]:
data_with_classes

Unnamed: 0,0,1,2,3,0.1
0,17,1,1,44,1
1,22,1,1,84,1
2,43,1,1,26,0
3,17,1,1,60,1
4,91,1,1,34,0
...,...,...,...,...,...
4088,43,4,2,22,0
4089,17,4,2,74,1
4090,43,4,2,20,0
4091,17,4,2,36,1


In [361]:
data.plot(feature='QUANTITY_SOLD', target='PRICE_PER_UNIT', color=data_with_classes.iloc[:,-1])

In [362]:
pd.DataFrame(data.logistic_regression(features))

Unnamed: 0,0,1
0,2067,1
1,0,2021


In [363]:
pd.DataFrame(data.svm(features))

Unnamed: 0,0,1
0,2021,0
1,1534,534


In [366]:
pd.DataFrame(data.knn(features, n_neighbors=3))

Unnamed: 0,0,1
0,2021,0
1,2068,0
