# Exploratory data analysis

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

## H&M recommendations

In [None]:
# Load the articles table from the zipped CSV in the working directory.
articles = pd.read_csv('articles.csv.zip')

In [None]:
# Preview the first five article rows.
articles.head(5)

In [None]:
# Distinct values of the index_name column.
articles['index_name'].unique()

In [None]:
# Horizontal count plot: number of articles per index name.
_, ax = plt.subplots(figsize=(15, 7))
sns.histplot(data=articles, y='index_name', color='green', ax=ax)
ax.set(xlabel='Count by index name', ylabel='Index name')
plt.show()

**To identify the indexes that account for the largest and smallest shares of all items.**

In [None]:
# Stacked horizontal count plot: articles per garment group, split by index group.
_, ax = plt.subplots(figsize=(15, 7))
sns.histplot(
    data=articles,
    y='garment_group_name',
    hue='index_group_name',
    multiple='stack',
    color='orange',
    ax=ax,
)
ax.set(xlabel='Count by garment group', ylabel='Garment group')
plt.show()

**To identify the most popular garment groups within each index group.**

In [None]:
# Count the distinct product types inside every product group.
temp = articles.groupby(['product_group_name'])['product_type_name'].nunique()
df = pd.DataFrame({'Product group': temp.index,
                   'Product types number': temp.values
                  })
# Largest groups first so the bars read from most to least varied.
df = df.sort_values(['Product types number'], ascending=False)

# Draw on an explicit axes rather than relying on the implicit "current" one.
_, s = plt.subplots(figsize=(8, 6))
sns.barplot(x='Product group', y='Product types number', data=df, ax=s)
s.set_title('Number of Product types per each Product group')
# Rotate the existing tick labels in place instead of re-setting them.
s.tick_params(axis='x', labelrotation=90)
plt.show()

**To identify the number of product types in each product group.**

In [None]:
# Count the distinct articles for every perceived master colour.
temp = articles.groupby(['perceived_colour_master_name'])['article_id'].nunique()
df = pd.DataFrame({'Perceived colour master name': temp.index,
                   'Num of articles': temp.values
                  })
# Most common colours first.
df = df.sort_values(['Num of articles'], ascending=False)

# Draw on an explicit axes rather than relying on the implicit "current" one.
_, s = plt.subplots(figsize=(12, 6))
sns.barplot(x='Perceived colour master name', y='Num of articles', data=df, ax=s)
s.set_title('Number of Articles per each Perceived colour master Name')
# Rotate the existing tick labels in place instead of re-setting them.
s.tick_params(axis='x', labelrotation=90)
plt.show()


**To identify the most popular colors**

In [None]:
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)

def show_wordcloud(data, title=None):
    """Render a word cloud for ``data`` and show it.

    Parameters
    ----------
    data : str, pandas.Series, pandas.Index, numpy.ndarray, list or tuple
        Text to visualise. A plain string is used as is. For array-like
        input every non-missing element is converted to ``str`` and the
        pieces are joined with single spaces. Any other object falls back
        to ``str(data)``.
    title : str, optional
        Figure title; omitted when falsy.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # str() of a pandas Series is its *truncated* console repr: only the first
    # and last few rows plus footer tokens such as "Name", "Length", "dtype".
    # Join the actual values instead so the cloud reflects the whole column.
    if isinstance(data, str):
        text = data
    elif isinstance(data, (pd.Series, pd.Index, np.ndarray, list, tuple)):
        flat_values = pd.Series(np.asarray(data, dtype=object).ravel())
        text = ' '.join(flat_values.dropna().astype(str))
    else:
        text = str(data)

    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=5,
        random_state=1  # fixed seed keeps the layout stable between runs
    ).generate(text)

    fig = plt.figure(1, figsize=(10,10))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=14)
        # NOTE(review): top=2.3 lies outside the usual 0..1 figure fraction;
        # kept as in the original layout - confirm it is intentional.
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

In [None]:
# Word cloud over the product-name column.
show_wordcloud(data=articles['prod_name'], title='Wordcloud from product name')

**To identify frequent words visually.**

In [None]:
# Word cloud over the detailed-description column.
show_wordcloud(data=articles['detail_desc'], title='Wordcloud from detailed description of items')

In [None]:
# Load the customers table and preview its first ten rows.
customers = pd.read_csv('customers.csv.zip')
customers.head(10)

In [None]:
# Histogram of customer ages (50 bins) on a dark-grid theme.
sns.set_style('darkgrid')

_, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=customers, x='age', bins=50, color='green', ax=ax)
ax.set(xlabel='Distribution of the customers age')
plt.show()

**To identify the most common age.**

In [None]:
# Load the training transactions from the zipped CSV in the working directory.
transactions = pd.read_csv('transactions_train.csv.zip')

In [None]:
# Preview the first ten transaction rows.
transactions.head(10)

In [None]:
# Attach article attributes to every transaction; the left join keeps all
# transactions even when an article_id has no match in `articles`.
merged = pd.merge(
    transactions[['customer_id', 'article_id', 'price', 't_dat']],
    articles[['article_id', 'prod_name', 'product_type_name',
              'product_group_name', 'index_name']],
    how='left',
    on='article_id',
)


In [None]:
# Mean transaction price for each index name, drawn as horizontal bars.
articles_index = merged.groupby('index_name')[['price']].mean()
sns.set_style('darkgrid')
_, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=articles_index['price'],
    y=articles_index.index,
    color='green',
    alpha=0.8,
    ax=ax,
)
ax.set(xlabel='Price by index', ylabel='Index')
plt.show()

**To identify indexes with the highest and lowest mean prices.**

In [None]:
from datetime import datetime

# Daily transaction counts per sales channel, computed on a sample of at most
# 200k rows. The fixed random_state makes the chart identical on every run, and
# capping n at the frame length avoids a ValueError on smaller inputs.
grouped = (
    transactions
    .sample(n=min(200000, len(transactions)), random_state=1)
    .groupby(['t_dat', 'sales_channel_id'])['article_id']
    .count()
    .reset_index()
)
# Vectorised date parsing instead of a per-row strptime call.
grouped['t_dat'] = pd.to_datetime(grouped['t_dat'], format='%Y-%m-%d')

grouped.columns = ['Date', 'Sales Channel Id', "Transactions"]

_, ax = plt.subplots(1, 1, figsize=(16,6))
# One line per sales channel (ids 1 and 2).
for channel in (1, 2):
    channel_rows = grouped.loc[grouped['Sales Channel Id'] == channel]
    ax.plot(channel_rows['Date'], channel_rows['Transactions'],
            label=f'Sales Channel {channel}')

ax.set_xlabel('Date')
ax.set_ylabel('Num of transactions')
ax.legend()
ax.set_title('Number of transactions per day, grouped by Sales channel (200k sample)')
plt.show()

## AMEX

In [None]:
def plot_time_series(prefix='D', cols=None, display_ct=32, data=None):
    """Plot per-customer time series and value histograms for feature columns.

    For every selected column ``{prefix}_{n}`` one figure is drawn: the left
    panel shows the feature over ``S_2`` for up to ``display_ct`` customers
    with ``target == 1`` (blue) and up to ``display_ct`` customers with
    ``target == 0`` (orange); the right panel shows horizontal histograms of
    the plotted values for both groups.

    Parameters
    ----------
    prefix : str
        Column-name prefix (the part before the underscore).
    cols : sequence of int, optional
        Numeric suffixes to plot. ``None`` or an empty sequence selects every
        column with the prefix found in ``columns[2:-1]``.
    display_ct : int
        Maximum number of customers plotted per target group.
    data : pandas.DataFrame, optional
        Frame holding ``customer_ID``, ``S_2``, ``target`` and the feature
        columns. When omitted the notebook-level ``df`` is used, as before.

    Returns
    -------
    None
        Figures are displayed with ``plt.show()``.
    """
    import warnings

    # Fall back to the notebook-level frame only when none is passed in; this
    # keeps old calls working while allowing an explicit, safer input.
    frame = df if data is None else data

    # DETERMINE WHICH COLUMNS TO PLOT
    if cols is not None and len(cols) == 0:
        cols = None
    if cols is None:
        # The first two and the last column are skipped - presumably
        # customer_ID, S_2 and target; verify against the frame layout.
        tag = f'{prefix}_'
        numbers = sorted(int(name[len(tag):]) for name in frame.columns[2:-1]
                         if name.startswith(tag))
        COLS = [f'{prefix}_{x}' for x in numbers]
        print('#'*25)
        print(f'Plotting all {len(COLS)} columns with prefix {prefix}')
        print('#'*25)
    else:
        COLS = [f'{prefix}_{x}' for x in cols]
        print('#'*25)
        print(f'Plotting {len(COLS)} columns with prefix {prefix}')
        print('#'*25)

    # ITERATE COLUMNS
    for c in COLS:

        # One row per (customer, target); the S_2 and feature cells each hold
        # that customer's full list of values.
        per_customer = (
            frame[['customer_ID', 'S_2', c, 'target']]
            .groupby(['customer_ID', 'target'])[['S_2', c]]
            .agg(list)
            .reset_index()
        )
        defaulted = per_customer.loc[per_customer.target == 1]
        non_defaulted = per_customer.loc[per_customer.target == 0]

        # FORMAT PLOT: wide time-series panel on the left, histogram on the right
        spec = gridspec.GridSpec(ncols=2, nrows=1,
                                 width_ratios=[3, 1], wspace=0.1,
                                 hspace=0.5, height_ratios=[1])
        fig = plt.figure(figsize=(20,10))
        ax0 = fig.add_subplot(spec[0])

        # PLOT UP TO display_ct DEFAULT AND NON-DEFAULT CUSTOMERS
        # Each group is bounds-checked on its own so that running out of
        # customers in one group no longer hides customers of the other.
        t0 = []
        t1 = []
        groups = ((defaulted, 'blue', t1), (non_defaulted, 'orange', t0))
        for k in range(display_ct):
            for rows, colour, collected in groups:
                if k >= len(rows):
                    continue
                row = rows.iloc[k]
                try:
                    ax0.plot(row.S_2, row[c], '-o', color=colour)
                except (TypeError, ValueError) as exc:
                    # Best effort: skip a series that cannot be drawn, but say so.
                    warnings.warn(f'Could not plot {c} for one customer: {exc}')
                    continue
                collected.extend(row[c])
        ax0.set_title(f'Feature {c} (Key: BLUE=DEFAULT, orange=no default)', size=18)

        # PLOT HISTOGRAMS
        ax1 = fig.add_subplot(spec[1])
        try:
            # COMPUTE BINS: 20 equal-width bins over the plotted value range
            t = t0 + t1
            mn = np.nanmin(t)
            mx = np.nanmax(t)
            if mx == mn:
                # Widen a degenerate range so the bin width is non-zero.
                mx += 0.01
                mn -= 0.01
            step = (mx - mn) / 20
            bins = np.arange(mn, mx + step, step)
            # Skip a group whose values are all NaN (or that is empty).
            if np.sum(np.isnan(t1)) != len(t1):
                ax1.hist(t1, bins=bins, orientation="horizontal", alpha=0.8, color='blue')
            if np.sum(np.isnan(t0)) != len(t0):
                ax1.hist(t0, bins=bins, orientation="horizontal", alpha=0.8, color='orange')
        except (TypeError, ValueError) as exc:
            # Empty, all-NaN or non-numeric values: leave the histogram blank.
            warnings.warn(f'Skipping histogram for {c}: {exc}')
        plt.show()
        # Release the figure; many columns would otherwise pile up in memory.
        plt.close(fig)

In [None]:
from matplotlib import gridspec

def plot_customers(data, col, date_col, display_ct=32):
    """Plot one feature over time for default and non-default customers.

    The left panel shows ``col`` against ``date_col`` for up to ``display_ct``
    customers with ``target == 1`` (blue) and up to ``display_ct`` customers
    with ``target == 0`` (orange). The right panel shows horizontal histograms
    of the plotted values for both groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``customer_ID``, ``target``, ``date_col`` and ``col`` columns.
    col : str
        Name of the feature column to plot.
    date_col : str
        Name of the column used for the x axis.
    display_ct : int
        Maximum number of customers plotted per target group.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    import warnings

    # One row per (customer, target); the date and feature cells each hold
    # that customer's full list of values. groupby does not modify `data`,
    # so no defensive copy of the frame is needed.
    per_customer = (
        data.groupby(['customer_ID', 'target'])[[date_col, col]]
        .agg(list)
        .reset_index()
    )
    defaulted = per_customer.loc[per_customer.target == 1]
    non_defaulted = per_customer.loc[per_customer.target == 0]

    # FORMAT PLOT: wide time-series panel on the left, histogram on the right
    spec = gridspec.GridSpec(ncols=2, nrows=1,
                             width_ratios=[3, 1], wspace=0.1,
                             hspace=0.5, height_ratios=[1])
    fig = plt.figure(figsize=(20,10))
    ax0 = fig.add_subplot(spec[0])

    # PLOT UP TO display_ct DEFAULT AND NON-DEFAULT CUSTOMERS
    # Each group is bounds-checked on its own so that running out of customers
    # in one group no longer hides customers of the other.
    t0 = []
    t1 = []
    groups = ((defaulted, 'blue', t1), (non_defaulted, 'orange', t0))
    for k in range(display_ct):
        for rows, colour, collected in groups:
            if k >= len(rows):
                continue
            row = rows.iloc[k]
            try:
                ax0.plot(row[date_col], row[col], '-o', color=colour)
            except (TypeError, ValueError) as exc:
                # Best effort: skip a series that cannot be drawn, but say so.
                warnings.warn(f'Could not plot {col} for one customer: {exc}')
                continue
            collected.extend(row[col])
    ax0.set_title(f'Feature {col} (Key: blue=default, orange=non default)', size=18)

    # PLOT HISTOGRAMS
    ax1 = fig.add_subplot(spec[1])
    try:
        # COMPUTE BINS: 20 equal-width bins over the plotted value range
        t = t0 + t1
        mn = np.nanmin(t)
        mx = np.nanmax(t)
        if mx == mn:
            # Widen a degenerate range so the bin width is non-zero.
            mx += 0.01
            mn -= 0.01
        step = (mx - mn) / 20
        bins = np.arange(mn, mx + step, step)
        # Skip a group whose values are all NaN (or that is empty).
        if np.sum(np.isnan(t1)) != len(t1):
            ax1.hist(t1, bins=bins, orientation="horizontal", alpha=0.8, color='blue')
        if np.sum(np.isnan(t0)) != len(t0):
            ax1.hist(t0, bins=bins, orientation="horizontal", alpha=0.8, color='orange')
    except (TypeError, ValueError) as exc:
        # Empty, all-NaN or non-numeric values: leave the histogram blank.
        warnings.warn(f'Skipping histogram for {col}: {exc}')

    plt.show()

In [None]:
# Read the first 100,000 rows of the training data and parse the S_2 column
# into datetimes.
train_data = pd.read_csv('train_data.csv.zip', nrows=100000)
train_data['S_2'] = pd.to_datetime(train_data['S_2'])

# Attach the labels; the left join keeps every row of the training data.
train_labels = pd.read_csv('train_labels.csv.zip')
train_data = pd.merge(train_data, train_labels, how='left', on='customer_ID')

In [None]:
# Feature D_41 over S_2.
plot_customers(data=train_data, col='D_41', date_col='S_2')

In [None]:
# Feature P_2 over S_2.
plot_customers(data=train_data, col='P_2', date_col='S_2')

In [None]:
# Feature D_96 over S_2.
plot_customers(data=train_data, col='D_96', date_col='S_2')