In [None]:
"""
Univariate analysis on all Transaction data

Histogram: 
Count
mode
variance: measure of dispersion 
std dev: sqrt of variance
coefficient of variation: (std dev / mean) * 100%
skewness: symmetry or asymmetry
kurtosis: heaviness of the distribution's tails compared with a Normal distribution

Box plot: 
Min
Max
mean
median 
quantile 
range 

"""

In [None]:
"""
t_dat: dates of transactions, categorical ordinal            
customer_id: customer ID, categorical nominal       
article_id: clothing item ID, categorical nominal        
price: price of item, numerical interval             
sales_channel_id: where item was sold ID, categorical nominal    

"""

In [None]:
import numpy as np
import pandas as pd 
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go


In [None]:
# Candidate locations of the project data. In the cells visible here only
# `shared_drive` is actually used (by the CSV-loading cell); the other two are
# not referenced.
# NOTE(review): these are absolute, machine-specific Windows paths — the
# notebook cannot run on another machine without editing them.
shared_drive = r"G:\.shortcut-targets-by-id\184tVjsIO-GAjbkSakwDbEZ40M5mPpgu4\Capstone\cleaned_data"
google_drive = r"G:\My Drive\Spring_2022\CS554\Project\data"

drive = r"D:\Users\yiboz\Programming\Github\CS554\data"

In [None]:
# Read the three input tables from the shared drive, in this order:
# article metadata, customer metadata, then the transaction log.
articles, customers, transactions = (
    pd.read_csv(shared_drive + csv_suffix)
    for csv_suffix in (
        r"\articles_clean.csv",
        r"\customers_clean.csv",
        r"\transactions_train.csv",
    )
)

In [None]:
def categoryCounter(df, names):
    """Tabulate how often each distinct value occurs in a column.

    Parameters
    ----------
    df : pandas.Series
        The column to tabulate. Despite the parameter name, the callers in
        this notebook pass a single column, not a whole DataFrame.
    names : list of str, or str
        Label for the value column of the result. When a list (or tuple) is
        given, only its first element is used.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, most frequent first, with columns
        ``<name>`` (the value), ``count`` (number of occurrences) and
        ``freq`` (share of all counted rows, in percent).
    """
    name = names if isinstance(names, str) else names[0]

    # Label the index and the counts explicitly instead of renaming whatever
    # value_counts() happens to produce: pandas 1.x and 2.x name that result
    # differently, and the previous rename-based code ended up with two
    # columns called "count" on pandas >= 2.0.
    counts = df.value_counts().rename_axis(name).reset_index(name="count")

    total = counts["count"].sum()
    counts["freq"] = (counts["count"] / total) * 100

    return counts
    

In [None]:
def maxRow(df):
    """Return every row of ``df`` whose ``count`` equals the column maximum.

    Several rows are returned when they tie for the largest count.
    """
    counts = df['count']
    at_peak = counts == counts.max()
    return df.loc[at_peak]
    

In [None]:
def minRow(df):
    """Return every row of ``df`` whose ``count`` equals the column minimum.

    Several rows are returned when they tie for the smallest count.
    """
    counts = df['count']
    at_floor = counts == counts.min()
    return df.loc[at_floor]

In [None]:
"""
t_dat
Transaction Date Exploration: 
max: 2019-09-28, 198622, 0.624827
min: 2020-01-01, 12760, 0.040141
mean: 43308.34332425068 
range: 185862
"""

In [None]:
# Number of transactions per date, plus each date's share of all rows (percent).
dateCounts = categoryCounter(transactions["t_dat"], ['t_dat'])
dateCounts.head()

In [None]:
# Date(s) with the most transactions.
maxRow(dateCounts)


In [None]:
# Date(s) with the fewest transactions.
minRow(dateCounts)

In [None]:
# Share of all transactions (percent) falling on each date.
dateFig1 = px.bar(dateCounts, x="t_dat", y="freq", title="Transaction Date Frequency")
dateFig1.show()

In [None]:
# Same view in absolute transaction counts.
dateFig2 = px.bar(dateCounts, x="t_dat", y="count", title="Transaction Date Counts")
dateFig2.show()

In [None]:
"""
customer_id 
Customer ID exploration 
unique: 1,362,281
max: 1895 transactions, 0.005961% of all transactions
min: 1 transaction (131514 customers)
mean: 23.334630667241193

Graph not meaningful, too many unique values 

"""

In [None]:
# Number of transactions per customer, plus each customer's share of all rows (percent).
customerCounts = categoryCounter(transactions["customer_id"], ['customer_id'])
customerCounts.head()

In [None]:
# Customer(s) with the most transactions.
maxRow(customerCounts)

In [None]:
# Customer(s) with the fewest transactions.
minRow(customerCounts)

In [None]:
# Average number of transactions per customer.
meanC = customerCounts['count'].mean()
meanC

In [None]:
"""
article_id 
Article ID exploration:
max: article 706016001, 50287 transactions, 0.158193% of all transactions
min: 1 transaction (4491 articles)
mean: 304.057734798703 
range: 

"""

In [None]:
# Number of transactions per article, plus each article's share of all rows (percent).
articleCounts = categoryCounter(transactions['article_id'], ['article_id'])
articleCounts.head()

In [None]:
# Non-null entries per column; articleCounts has one row per distinct article.
articleCounts.count()

In [None]:
# Article(s) with the most transactions.
maxRow(articleCounts)

In [None]:
# Article(s) with the fewest transactions.
minRow(articleCounts)

In [None]:
# Average number of transactions per article.
articleCounts['count'].mean()

In [None]:
# Peek at the article metadata table.
articles.head()

In [None]:
# Reduce the article metadata to a lookup from article_id (index) to
# index_group_name. The guard makes the cell safe to re-run: once `articles`
# is indexed by article_id, selecting an 'article_id' column again would
# raise KeyError.
if articles.index.name != 'article_id':
    articles = articles.set_index('article_id')
articles = articles[['index_group_name']]

In [None]:
# Attach each article's index group to its transaction count; the join matches
# the article_id column of articleCounts against the index of `articles`.
joinedArticles = articleCounts.join(articles, on='article_id' )
joinedArticles.head()

In [None]:
# Number of distinct sold articles in each index group (one row per article).
joinedArticles['index_group_name'].value_counts()

In [None]:
# Total transactions and share of all rows (percent) per index group, largest
# group first. Only 'count' and 'freq' are summed: adding up the article_id
# values themselves produces a meaningless number.
aggjoinedArticles = (
    joinedArticles
    .groupby('index_group_name')[['count', 'freq']]
    .sum()
    .sort_values(by='count', ascending=False)
)
aggjoinedArticles

In [None]:
# Share of all transactions (percent) per index group.
articleFig1 = px.bar(aggjoinedArticles, x=aggjoinedArticles.index, y="freq", title="Articles Group Frequency")
articleFig1.show()


In [None]:
# Same breakdown in absolute transaction counts.
articleFig2 = px.bar(aggjoinedArticles, x=aggjoinedArticles.index, y="count", title="Articles Group Counts")
articleFig2.show()

In [None]:
"""
price 
Price exploration:
mean     2.782927e-02
std      1.918113e-02
min      1.694915e-05
25%      1.581356e-02
50%      2.540678e-02
75%      3.388136e-02
max      5.915254e-01

sample: 
mean     2.782653e-02
std      1.914895e-02
min      5.084746e-05
25%      1.579661e-02
50%      2.540678e-02
75%      3.388136e-02
max      5.067797e-01

skewness: 3.1105182811393455
kurtosis: 25.681147025714665

"""

In [None]:
# Work on the price column alone.
prices = transactions['price']
prices.head()

In [None]:
# Summary statistics over all transactions.
prices.describe()

In [None]:
# pandas reports excess kurtosis (Fisher's definition: a Normal distribution scores 0).
prices.kurtosis()

In [None]:
# Skewness of the price distribution.
prices.skew()

In [None]:
# 10% random sample of the prices. The fixed seed makes the sample — and any
# statistics computed from it — reproducible across kernel restarts.
samplePrices = prices.sample(frac=0.10, random_state=42)
samplePrices.head()

In [None]:
# Convert the sampled Series into a one-column DataFrame. The isinstance guard
# makes the cell safe to re-run: a DataFrame has no to_frame(), so the
# unguarded call raised AttributeError the second time.
if isinstance(samplePrices, pd.Series):
    samplePrices = samplePrices.to_frame()


In [None]:
# Summary statistics of the sample.
samplePrices.describe()

In [None]:
# Box plot of the sampled prices; points=False draws no individual sample points.
priceFig1 = px.box(samplePrices, y="price", points=False, title="Prices with No Outliers" )
priceFig1.show()

In [None]:
#priceFig1 = ff.create_distplot([prices], ['prices_dist'])
#priceFig1.show()

In [None]:
"""
sales_channel_id 
Sales Channel exploration: 
max: 2, 22379862, 70.402774
min: 1, 9408462, 29.597226
mean:  
range: 

"""

In [None]:
# Number of transactions per sales channel, plus each channel's share of all rows (percent).
salesCCounts = categoryCounter(transactions['sales_channel_id'], ['sales_channel_id'])
salesCCounts.head()

In [None]:
# Average number of transactions per sales channel. Averaging the whole frame
# would also average the channel IDs themselves, which carries no meaning.
salesCCounts['count'].mean()

In [None]:
# Transactions per sales channel.
salesCFig1 = px.bar(salesCCounts, x="sales_channel_id", y="count", title="Sales Channel Counts")
# Channel IDs are labels, not quantities: force a categorical x axis so the
# chart shows one bar per channel instead of a continuous numeric scale.
salesCFig1.update_xaxes(type="category")
salesCFig1.show()

In [None]:
# Split of all transactions between the sales channels.
salesCFig2 = px.pie(salesCCounts, values='count', names='sales_channel_id', title='Sales Channel Splits')
salesCFig2.show()