In [23]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport

from matplotlib.ticker import PercentFormatter
# Plot defaults: 8x5 figures, white figure and axes background, black axes border.
plt.rcParams.update(
    {
        "figure.figsize": (8, 5),
        "figure.facecolor": "w",
        "axes.facecolor": "white",
        "axes.edgecolor": "black",
    }
)
# Register pandas formatters and converters with matplotlib.
pd.plotting.register_matplotlib_converters()
# Render floats with three decimals in DataFrame / Series output.
pd.set_option("display.float_format", "{:.3f}".format)

In [24]:
# Read with the default integer index (no index_col is passed); per the original
# note this is deliberate, because the data contains duplicates that are needed.
df = pd.read_csv("measurementRectanglesMissing.csv")
print(df.columns)

Index(['brand', 'style', 'menWomen', 'name', 'fabric', 'price',
       'maxHeightFront', 'minHeightFront', 'rivetHeightFront', 'maxWidthFront',
       'minWidthFront', 'maxHeightBack', 'minHeightBack', 'maxWidthBack',
       'minWidthBack', 'cutout', 'waistSize', 'updatedStyle', 'group',
       'priceGroup', 'pocketArea', 'rectanglePhone', 'rectanglePen',
       'rectangleWallet', 'rectangleHand', 'rectangeWallet'],
      dtype='object')


# Missing Data

In [27]:
# Work on a copy so that imputing values does not modify the raw `df`.
df_impute = df.copy()
# Number of missing prices before imputation. This is kept as the LAST expression
# of the cell so the notebook actually displays it; a bare expression followed by
# another statement is evaluated and silently discarded.
df_impute["price"].isnull().sum()

In [28]:
# Imputing with mean value
def impute(series: pd.Series) -> pd.Series:
    """Return a new Series where missing values are replaced by the series mean.

    The mean comes from ``Series.mean`` (which skips missing values by default).
    The input series itself is left unchanged.
    """
    return series.fillna(series.mean())

In [29]:
df_impute["price"] = impute(df_impute["price"])

In [30]:
df_impute.price.isnull().sum() 

0

## Exercise 1:

Add functions to impute with min, max and add tests


# Let's look at the data

- women jeans are more expensive
- women pockets are smaller

In [31]:
# Cast the label columns (brand, style, menWomen) to the pandas category dtype.
for column in ("brand", "style", "menWomen"):
    df[column] = df[column].astype("category")

### Are women jeans more expensive?

In [32]:
df.price.describe()

count    72.000
mean     80.989
std      45.247
min       9.990
25%      49.980
50%      73.975
75%      95.713
max     249.000
Name: price, dtype: float64

In [33]:
# Bin edges follow df.price.describe(): min, quartiles and max, rounded to whole
# numbers (min down, the rest up) so each price lands in one (left, right] bin.
bins = [9, 50, 74, 96, 250]
df["price_binned"] = pd.cut(df["price"], bins=bins)

In [34]:
# Hypothesis: women's jeans are more expensive.
# Result: the counts per price bin are close for both groups, so not really.
pd.crosstab(df["menWomen"], df["price_binned"])

price_binned,"(9, 50]","(50, 74]","(74, 96]","(96, 250]"
menWomen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
men,11,8,7,9
women,11,6,11,9


### Are women pockets smaller?

In [35]:
df.pocketArea.describe()

count      80.000
mean     8424.815
std      2699.333
min      4454.429
25%      5905.184
50%      8618.737
75%     10724.592
max     13102.032
Name: pocketArea, dtype: float64

In [36]:
# Bin edges follow df.pocketArea.describe(): min, quartiles and max, rounded to
# whole numbers (min down, the rest up) so each area lands in one (left, right] bin.
bins = [4454, 5906, 8619, 10725, 13103]
df["pocket_binned"] = pd.cut(df["pocketArea"], bins=bins)

In [37]:
pd.crosstab(df["menWomen"], df.pocket_binned, normalize="columns")

pocket_binned,"(4454, 5906]","(5906, 8619]","(8619, 10725]","(10725, 13103]"
menWomen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
men,0.0,0.1,0.9,1.0
women,1.0,0.9,0.1,0.0


# Get women pocket size score for each brand

to get this score we can do a simple approach and a more complex one

### Simple score with values from 0 to 2, the number of women pants that are larger than average in that brand

- columns I would add: is_size_smaller_than_average ( per brand / per dataset )

- method: get_pocket_size_score -> return a score based on if women pocket sizes are higher than average... by brand in relation to the rest maybe

## Data Transformations
### Exercise 2: 

#### Creating a column with data transformation of another column

columns I would add: is_size_smaller_than_average ( per brand / per dataset )

In [39]:
def is_greater_than_average(series: pd.Series) -> pd.Series:
    """Flag each value of ``series`` that is strictly above the series mean.

    Parameters
    ----------
    series : pd.Series
        Numeric values to compare against their own mean.

    Returns
    -------
    pd.Series
        Integer Series (1 = above the mean, 0 = otherwise) carrying the same
        index as ``series``. Values equal to the mean and missing values map
        to 0, so a constant series yields all zeros.
    """
    avg = series.mean()
    # A vectorised comparison keeps the result a Series (the annotated return
    # type) rather than a plain list, so callers can chain Series methods such
    # as .sum(), and the original index is preserved for aligned assignment.
    return series.gt(avg).astype(int)

In [40]:
df["size_greater_than_average"] = is_greater_than_average(df["pocketArea"])

In [41]:
df["size_greater_than_average"].sum() 
# 41 of the pockets are of size greater than average

41

In [43]:
# the test failed; with the list-returning version of is_greater_than_average, uncommenting the next line fails too (a list has no .sum())
#is_greater_than_average(df["pocketArea"]).sum()

## Exercise 2

Fix the test: we have accidentally stumbled upon a situation where something works in the "notebook" but could fail when used as a function, and maybe later in a pipeline... so our test kind of failed for the wrong reasons.

Change the test to a type test

Add more tests to this function, how does it work when all values in the series are the same?

# Pocket size scoring

method: get_pocket_size_score -> return a score based on if women pocket sizes are higher than average... by brand in relation to the rest maybe


## Exercise 3

exercise: 
- add method to new script .. called scoring.py for example, 
- add test file in test folder for this
- write test, can the following method be tested as it is written? Does it follow testable functions conventions?

In [None]:
# Goal: count the women's rows with "size_greater_than_average" > 0.
# Here the flags are summed per (brand, menWomen) pair.
# input:  brand, menWomen, size_greater_than_average -> 80 rows
# output: brand, score                               -> fewer rows (~40)
aggr = (
    df.groupby(by=["brand", "menWomen"], as_index=False)["size_greater_than_average"]
    .sum()
)

In [None]:
def get_sum_score_by_brand_and_gender(frame: pd.DataFrame, brand_col="brand", gender_col="menWomen", score_by="size_greater_than_average") -> pd.DataFrame :
    """Sum ``score_by`` for every (brand, gender) combination in ``frame``.

    Parameters
    ----------
    frame : pd.DataFrame
        Table holding the two grouping columns and the score column.
    brand_col, gender_col : str
        Names of the columns to group by, in that order.
    score_by : str
        Name of the column whose values are summed within each group.

    Returns
    -------
    pd.DataFrame
        One row per group, with the grouping columns kept as regular columns
        (``as_index=False``) next to the summed score.

    NOTE(review): if the grouping columns are categorical, groupby's
    ``observed`` default decides whether absent combinations appear - confirm
    the intended behaviour.
    """
    group_keys = [brand_col, gender_col]
    grouped = frame.groupby(by=group_keys, as_index=False)
    return grouped[score_by].sum()

In [None]:
aggr = get_sum_score_by_brand_and_gender(df, "brand", "menWomen", "size_greater_than_average")
aggr

In [None]:
aggr[aggr.menWomen == "women"]

In [None]:
#todo: write the test

# Homework


## a less all or nothing score

We sort the pocket areas and assign to each row the position of its area value in the sorted order (values should be 0 - dataframe size)
- brand women score = average position (sum / 2)
- brand men score = average position (sum / 2)
- score = women / men

todo: 
- write this as a testable method, add this to the scoring.py .. might be more than one
- write test

In [None]:
df_small = df[["brand", "menWomen", "pocketArea"]]

In [None]:
df_small = df_small.sort_values("pocketArea").reset_index(drop=True)

In [None]:
df_small['rank'] = df_small.index + 1

In [None]:
df_small

In [None]:
aggr = df_small.groupby(by=["brand", "menWomen"],as_index=False)["rank"].sum()

In [None]:
aggr