In [40]:
#Imports
from datascience import *
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline

https://stockx.com/news/the-2019-data-contest/

The data in this sheet consist of a random sample of all U.S. Off-White x Nike and Yeezy 350 sales from between 9/1/2017 and 2/13/2019. 

To create this sample, StockX took a random, fixed percentage of their sales (X%) for each colorway, on each day, since September 2017. So, for each day the Off-White Jordan 1 was on the market, they randomly selected X% of its sale from each day. (It’s not important to know what X is; all that matters is that it’s a random sample, and that the same fixed X% of sales was selected from every day, for every sneaker).

They've included 8 variables for us to work with: Order Date, Brand, Sneaker Name, Sale Price ($), Retail Price ($), Release Date, Shoe Size, and Buyer State (the U.S. state the buyer shipped to). You can use whatever variables you want in the analysis; you can use 1 variable, or you can use all 8. And remember, every row in the spreadsheet represents an individual StockX sale. There are no averages or order counts; this is just a random sample of daily sales data.

In [41]:
sneakerdata = pd.read_excel('StockX-Data-Contest-2019-3.xlsx')

In [42]:
sneakerdata.shape

(99956, 8)

In [43]:
sneakerdata.head()

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
0,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,2016-09-24,11.0,California
1,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685.0,220,2016-11-23,11.0,California
2,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690.0,220,2016-11-23,11.0,California
3,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075.0,220,2016-11-23,11.5,Kentucky
4,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828.0,220,2017-02-11,11.0,Rhode Island


##### Data Cleaning

Let's check for any missing data.

In [44]:
sneakerdata.isnull().sum()

Order Date      0
Brand           0
Sneaker Name    0
Sale Price      0
Retail Price    0
Release Date    0
Shoe Size       0
Buyer Region    0
dtype: int64

There are no missing data. Let's now check if the data types of the columns are consistent with the values in the columns.

In [45]:
sneakerdata.dtypes

Order Date      datetime64[ns]
Brand                   object
Sneaker Name            object
Sale Price             float64
Retail Price             int64
Release Date    datetime64[ns]
Shoe Size              float64
Buyer Region            object
dtype: object

The data is consistent with the type of values.

The data seems to be about shoe sales, but there is no profit column. Profit is the difference between the sale price and the retail price. Let's add a profit column to our data frame.

In [46]:
sneakerdata['Profit'] = sneakerdata['Sale Price'] - sneakerdata['Retail Price']

In [47]:
sneakerdata.head()

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region,Profit
0,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,2016-09-24,11.0,California,877.0
1,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685.0,220,2016-11-23,11.0,California,465.0
2,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690.0,220,2016-11-23,11.0,California,470.0
3,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075.0,220,2016-11-23,11.5,Kentucky,855.0
4,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828.0,220,2017-02-11,11.0,Rhode Island,608.0


Let's re-order the columns for aesthetics so that 'Profit' is next to 'Sale Price' and 'Retail Price'

In [48]:
sneakerdata = sneakerdata[['Order Date', 'Brand', 'Sneaker Name', 'Sale Price', 'Retail Price','Profit','Release Date','Shoe Size','Buyer Region']]
sneakerdata.head()

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Profit,Release Date,Shoe Size,Buyer Region
0,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,877.0,2016-09-24,11.0,California
1,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685.0,220,465.0,2016-11-23,11.0,California
2,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690.0,220,470.0,2016-11-23,11.0,California
3,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075.0,220,855.0,2016-11-23,11.5,Kentucky
4,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828.0,220,608.0,2017-02-11,11.0,Rhode Island


Let's take a look at the different shoe brands.

In [49]:
sneakerdata['Brand'].unique()

array([' Yeezy', 'Off-White'], dtype=object)

There seems to be an uncessary space before the word Yeezy in the data. Let us get rid of it using RegEx. Using this idea: https://stackoverflow.com/questions/25698710/replace-all-occurrences-of-a-string-in-a-pandas-dataframe-python  we will replace ' Yeezy' with 'Yeezy'

In [50]:
sneakerdata[sneakerdata['Brand'] == " Yeezy"].shape[0] ##checking initial rows

72162

In [51]:
sneakerdata[sneakerdata['Brand'] == "Off-White"].shape[0] ##checking initial rows

27794

In [52]:
sneakerdata['Brand'] = sneakerdata['Brand'].replace({' Yeezy': 'Yeezy'}, regex=True)

In [53]:
sneakerdata[sneakerdata['Brand'] == "Yeezy"].shape[0] ##rows match, the regex worked

72162

In [54]:
sneakerdata[sneakerdata['Brand'] == "Off-White"].shape[0] ##rows match, the regex worked

27794

In [55]:
sneakerdata['Buyer Region'].unique()

array(['California', 'Kentucky', 'Rhode Island', 'Michigan', 'New York',
       'Kansas', 'Florida', 'New Jersey', 'Texas', 'North Carolina',
       'Oregon', 'Alabama', 'Delaware', 'Virginia', 'Wisconsin',
       'Colorado', 'Massachusetts', 'Pennsylvania', 'Louisiana',
       'Washington', 'Georgia', 'Ohio', 'Nebraska', 'Oklahoma',
       'Connecticut', 'Missouri', 'South Carolina', 'Maine', 'Illinois',
       'Nevada', 'Maryland', 'Arizona', 'Minnesota', 'Iowa', 'Tennessee',
       'West Virginia', 'Indiana', 'Arkansas', 'Alaska', 'Wyoming',
       'Utah', 'New Hampshire', 'Vermont', 'District of Columbia',
       'Hawaii', 'New Mexico', 'South Dakota', 'Mississippi',
       'North Dakota', 'Idaho', 'Montana'], dtype=object)

##### Data Exploration

In my data exploration, I will try to answer some thought-provoking questions about the dataset that I will come up with myself in order to find some interesting results.

#### 1. Which shoe size generated the most average profit? For Yeezy's? For Off-whites? What about the least average profit?

First, let us see the unique shoe sizes for yeezy's and off-whites, respectively.

In [56]:
yeezy = sneakerdata[sneakerdata['Brand'] == "Yeezy"]

In [57]:
offwhite = sneakerdata[sneakerdata['Brand'] == "Off-White"]

In [58]:
yeezy.groupby(['Shoe Size']).mean()['Profit'].reset_index().sort_values('Profit', ascending = False).set_index('Shoe Size')['Profit']

Shoe Size
17.0    615.333333
16.0    482.066667
14.5    230.440476
13.5    172.523810
12.5    168.353464
14.0    156.344124
11.0    152.163142
13.0    151.477338
9.5     144.985291
10.0    143.860640
10.5    143.796080
6.5     143.208804
11.5    142.121689
12.0    141.474104
8.5     141.041237
6.0     140.270868
8.0     138.008518
9.0     132.979267
3.5     129.000000
5.5     127.116979
7.5     126.265436
7.0     125.619676
5.0     123.892903
4.5     114.205781
4.0     107.985032
Name: Profit, dtype: float64

In [59]:
offwhite.groupby(['Shoe Size']).mean()['Profit'].reset_index().sort_values('Profit', ascending = False).set_index('Shoe Size')['Profit']

Shoe Size
17.0    2060.000000
16.0    1490.250000
14.0     551.646847
12.5     536.092308
9.0      535.184513
10.0     530.122029
8.0      525.135565
9.5      511.911618
11.0     504.027723
8.5      492.487328
12.0     491.154280
15.0     490.515385
7.5      489.797904
13.0     472.214069
10.5     462.443742
11.5     458.529817
7.0      444.071095
5.0      438.117647
6.0      428.859061
4.5      416.850825
4.0      407.490314
6.5      405.141375
5.5      403.327526
3.5      305.333333
Name: Profit, dtype: float64

From these results based on the average profit for each size, larger sizes tend to have more profit than smaller sizes. This is actually contradictory to what real life profit entails. In real life, smaller sizes in general do better because they are made in less amounts than bigger sizes. Perhaps since this is a small subset of the data overall, we see that bigger sizes do better in profit.

#### 2. Which region tends to buy more Yeezy's? More Off-whites?

In [60]:
yeezy.groupby(['Buyer Region']).count().reset_index().sort_values('Order Date', ascending = False).set_index('Buyer Region')['Order Date'][0:10]

Buyer Region
California       13113
New York         12103
Oregon            5396
Florida           4484
Texas             4455
New Jersey        3371
Illinois          2732
Pennsylvania      2396
Michigan          2209
Massachusetts     2189
Name: Order Date, dtype: int64

In [61]:
offwhite.groupby(['Buyer Region']).count().reset_index().sort_values('Order Date', ascending = False).set_index('Buyer Region')['Order Date'][0:10]

Buyer Region
California       6236
New York         4422
Oregon           2285
Florida          1892
Texas            1421
New Jersey       1349
Illinois         1050
Massachusetts     714
Pennsylvania      713
Virginia          605
Name: Order Date, dtype: int64

Based on these results, it seems that the top 10 regions that tend to buy one brand tend to buy the other, with California being at the top.

#### 3. Which yeezy has the most profit on average? Least Profit?  Which Off-white has the most profit on average? Least Profit?

In [62]:
yeezy.groupby(['Sneaker Name']).mean()['Profit'].reset_index().sort_values('Profit', ascending = False).set_index('Sneaker Name')

Unnamed: 0_level_0,Profit
Sneaker Name,Unnamed: 1_level_1
Adidas-Yeezy-Boost-350-Low-Turtledove,1331.661765
Adidas-Yeezy-Boost-350-Low-Oxford-Tan,811.514286
Adidas-Yeezy-Boost-350-Low-Moonrock,796.71
Adidas-Yeezy-Boost-350-Low-Pirate-Black-2016,784.237705
Adidas-Yeezy-Boost-350-V2-Core-Black-Red,717.754967
Adidas-Yeezy-Boost-350-Low-Pirate-Black-2015,695.094737
Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,584.304438
Adidas-Yeezy-Boost-350-Low-V2-Beluga,568.704819
Adidas-Yeezy-Boost-350-V2-Core-Black-White,498.412238
Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,436.631111


The Yeezy with the most average profit is the 350 low turtledove while the lowest is the 350 v2 sesame.

In [63]:
offwhite.groupby(['Sneaker Name']).mean()['Profit'].reset_index().sort_values('Profit', ascending = False).set_index('Sneaker Name')

Unnamed: 0_level_0,Profit
Sneaker Name,Unnamed: 1_level_1
Air-Jordan-1-Retro-High-Off-White-White,1636.068894
Air-Jordan-1-Retro-High-Off-White-Chicago,1579.8
Nike-Air-Presto-Off-White,1076.055369
Nike-Air-Force-1-Low-Virgil-Abloh-Off-White-AF100,825.516129
Nike-Air-Max-97-Off-White-Elemental-Rose-Queen,703.782407
Air-Jordan-1-Retro-High-Off-White-University-Blue,660.853503
Nike-Blazer-Mid-Off-White-Wolf-Grey,654.822917
Nike-Air-VaporMax-Off-White,606.67619
Nike-Air-Presto-Off-White-Black-2018,570.469307
Nike-Blazer-Mid-Off-White,565.47505


The Offwhite with the most average profit is the air jordan 1 retro high off white while the least average profit is the nike zoom fly mercurial off white total orange.

In [64]:
yeezy['Sneaker Name'].unique()

array(['Adidas-Yeezy-Boost-350-Low-V2-Beluga',
       'Adidas-Yeezy-Boost-350-V2-Core-Black-Copper',
       'Adidas-Yeezy-Boost-350-V2-Core-Black-Green',
       'Adidas-Yeezy-Boost-350-V2-Core-Black-Red',
       'Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017',
       'Adidas-Yeezy-Boost-350-V2-Core-Black-White',
       'Adidas-Yeezy-Boost-350-V2-Cream-White',
       'Adidas-Yeezy-Boost-350-V2-Zebra',
       'Adidas-Yeezy-Boost-350-Low-Moonrock',
       'Adidas-Yeezy-Boost-350-Low-Pirate-Black-2016',
       'Adidas-Yeezy-Boost-350-Low-Oxford-Tan',
       'Adidas-Yeezy-Boost-350-Low-Turtledove',
       'Adidas-Yeezy-Boost-350-Low-Pirate-Black-2015',
       'Adidas-Yeezy-Boost-350-V2-Semi-Frozen-Yellow',
       'Adidas-Yeezy-Boost-350-V2-Beluga-2pt0',
       'Adidas-Yeezy-Boost-350-V2-Blue-Tint',
       'adidas-Yeezy-Boost-350-V2-Butter',
       'Adidas-Yeezy-Boost-350-V2-Sesame',
       'adidas-Yeezy-Boost-350-V2-Static',
       'adidas-Yeezy-Boost-350-V2-Static-Reflective'], dtype=objec

In [65]:
yeezy['Buyer Region'].unique()

array(['California', 'Kentucky', 'Rhode Island', 'Michigan', 'New York',
       'Kansas', 'Florida', 'New Jersey', 'Texas', 'North Carolina',
       'Oregon', 'Alabama', 'Delaware', 'Virginia', 'Wisconsin',
       'Colorado', 'Massachusetts', 'Pennsylvania', 'Louisiana',
       'Washington', 'Georgia', 'Ohio', 'Nebraska', 'Oklahoma',
       'Connecticut', 'Missouri', 'South Carolina', 'Maine', 'Illinois',
       'Nevada', 'Maryland', 'Minnesota', 'Iowa', 'Arizona',
       'West Virginia', 'Indiana', 'Tennessee', 'Wyoming', 'Vermont',
       'District of Columbia', 'Arkansas', 'Hawaii', 'New Mexico',
       'New Hampshire', 'South Dakota', 'Utah', 'Mississippi',
       'North Dakota', 'Idaho', 'Alaska', 'Montana'], dtype=object)

#### 4. In which region did each of the Yeezy's garner the most average profit?

In [66]:
test = yeezy[(yeezy['Sneaker Name'] == 'Adidas-Yeezy-Boost-350-Low-V2-Beluga') & (yeezy['Buyer Region'] == 'California')]
test.head()

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Profit,Release Date,Shoe Size,Buyer Region
0,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,877.0,2016-09-24,11.0,California
54,2017-09-04,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,960.0,220,740.0,2016-09-24,9.0,California
206,2017-09-11,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1050.0,220,830.0,2016-09-24,11.0,California
327,2017-09-15,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,790.0,220,570.0,2016-09-24,7.0,California
328,2017-09-15,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,800.0,220,580.0,2016-09-24,7.0,California


In [67]:
test = test.groupby('Sneaker Name').mean().reset_index()
test['Buyer Region'] = 'California'
test_result = test[['Sneaker Name', "Profit", "Buyer Region"]]
test_result

Unnamed: 0,Sneaker Name,Profit,Buyer Region
0,Adidas-Yeezy-Boost-350-Low-V2-Beluga,576.756522,California


In [68]:
test_result_2 = pd.DataFrame({'Sneaker Name':['Adidas-Yeezy-Boost-350-Low-V2-Beluga'], 'Profit': [655.123456], 'Buyer Region':['California']}, columns = ['Sneaker Name', 'Profit', 'Buyer Region'])

In [69]:
main_yeezy_df = pd.DataFrame()

check_each_country_df = pd.DataFrame()
check_each_country_df = check_each_country_df.append(test_result)
check_each_country_df = check_each_country_df.append(test_result_2)

In [70]:
check_each_country_df

Unnamed: 0,Sneaker Name,Profit,Buyer Region
0,Adidas-Yeezy-Boost-350-Low-V2-Beluga,576.756522,California
0,Adidas-Yeezy-Boost-350-Low-V2-Beluga,655.123456,California


In [71]:
check_each_country_df_max = check_each_country_df.sort_values('Profit',ascending = False)[0:1]

In [72]:
main_yeezy_df = main_yeezy_df.append(check_each_country_df_max)
main_yeezy_df

Unnamed: 0,Sneaker Name,Profit,Buyer Region
0,Adidas-Yeezy-Boost-350-Low-V2-Beluga,655.123456,California


In [73]:
# ##template code or all the scratch work above:
# yeezy['Sneaker Name'].unique()
# yeezy['Buyer Region'].unique()
# main_yeezy_df = pd.DataFrame()
# check_each_country_df = pd.DataFrame()

# test = yeezy[(yeezy['Sneaker Name'] == 'Adidas-Yeezy-Boost-350-Low-V2-Beluga') & (yeezy['Buyer Region'] == 'California')]
# test = test.groupby('Sneaker Name').mean().reset_index()
# test['Buyer Region'] = 'California'

# test_result = test[['Sneaker Name', "Profit", "Buyer Region"]]
# ##next
# test_result_2 = pd.DataFrame({'Sneaker Name':['Adidas-Yeezy-Boost-350-Low-V2-Beluga'], 'Profit': [655.123456], 'Buyer Region':['California']}, columns = ['Sneaker Name', 'Profit', 'Buyer Region'])

# check_each_country_df = check_each_country_df.append(test_result)
# check_each_country_df = check_each_country_df.append(test_result_2)

# check_each_country_df_max = check_each_country_df.sort_values('Profit',ascending = False)[0:1]
# main_yeezy_df = main_yeezy_df.append(check_each_country_df_max)

In [79]:
##template code:
# yeezy['Sneaker Name'].unique()
# yeezy['Buyer Region'].unique()

main_yeezy_df = pd.DataFrame()
check_each_country_df = pd.DataFrame()

##iterate through the unique sneaker names and regions here:
for i in yeezy['Sneaker Name'].unique():
    yeezy_in_list = yeezy[yeezy['Sneaker Name'] == i]
    for j in yeezy['Buyer Region'].unique():
        potential_yeezy = yeezy_in_list[(yeezy_in_list['Buyer Region'] == j)]
        potential_yeezy = potential_yeezy.groupby('Sneaker Name').mean().reset_index()
        potential_yeezy['Buyer Region'] = j
        yeezy_result = potential_yeezy[['Sneaker Name', "Profit", "Buyer Region"]]
        ##append each country's results to df:
        check_each_country_df = check_each_country_df.append(yeezy_result)
    ##select the highest country with the max value:
    main_yeezy_df = main_yeezy_df.append(check_each_country_df.sort_values('Profit',ascending = False)[0:1], ignore_index = True)
    
    

In [80]:
main_yeezy_df

Unnamed: 0,Sneaker Name,Profit,Buyer Region
0,Adidas-Yeezy-Boost-350-Low-V2-Beluga,823.545455,Kentucky
1,Adidas-Yeezy-Boost-350-Low-V2-Beluga,823.545455,Kentucky
2,Adidas-Yeezy-Boost-350-Low-V2-Beluga,823.545455,Kentucky
3,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,872.0,Alabama
4,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,872.0,Alabama
5,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,872.0,Alabama
6,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,872.0,Alabama
7,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,872.0,Alabama
8,Adidas-Yeezy-Boost-350-Low-Moonrock,1082.5,Michigan
9,Adidas-Yeezy-Boost-350-Low-Moonrock,1082.5,Michigan


#### 5. Hypothesis Testing: A/B testing

Random sample of sneakers. Compare:

(A) Profit of sneakers from Yeezy's

(B) Profit of sneakers from Off-whites

Question: Could the difference be due to chance alone?
Or are the profits from off-whites from a distribution with a smaller average?

##### Null Hypothesis: 
In the population, the distributions of the profits of the yeezy's and off-whites in the two groups are the same. (They are different in the sample just due to chance).

##### Alternative Hypothesis:
In the population, the profits of the yeezy's are, on average, lower than the profits of the off-whites.

Group A: yeezy's profits

Group B: off-whites profits

Statistic: Difference between average profits  (Note: Small values of this statistic favor the alternative)

Group A average - Group B average

If the null is true, all rearrangements of the profits among the two groups are equally likely

##### Plan:
○ Shuffle all the profits

○ Assign some to “Group A” and the rest to “Group B”, maintaining the two sample sizes

○ Find the difference between the averages of the two shuffled groups

○ Repeat

In [87]:
sneakerdata.head()

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Profit,Release Date,Shoe Size,Buyer Region
0,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,877.0,2016-09-24,11.0,California
1,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685.0,220,465.0,2016-11-23,11.0,California
2,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690.0,220,470.0,2016-11-23,11.0,California
3,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075.0,220,855.0,2016-11-23,11.5,Kentucky
4,2017-09-01,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828.0,220,608.0,2017-02-11,11.0,Rhode Island


In [88]:
yeezy_off_white = sneakerdata[['Brand','Profit']]

In [92]:
yeezy_off_white.groupby(['Brand']).count().reset_index()

Unnamed: 0,Brand,Profit
0,Off-White,27794
1,Yeezy,72162


In [108]:
average_profits = yeezy_off_white.groupby(['Brand']).mean().reset_index()

In [109]:
average_profits

Unnamed: 0,Brand,Profit
0,Off-White,492.102943
1,Yeezy,140.15873


In [110]:
average_profits['Profit'][1] - average_profits['Profit'][0]

-351.94421309530514

In [116]:
profits = sneakerdata[['Profit']]
profits

Unnamed: 0,Profit
0,877.0
1,465.0
2,470.0
3,855.0
4,608.0
...,...
99951,345.0
99952,378.0
99953,385.0
99954,430.0


In [119]:
shuffled_profits = profits.sample(frac = 1, replace = False)
shuffled_profits

Unnamed: 0,Profit
63988,10.0
62864,621.0
66217,20.0
38983,252.0
24958,179.0
...,...
9224,140.0
94500,415.0
8708,130.0
34171,70.0


In [120]:
groups = sneakerdata[[('Brand')]]
groups

Unnamed: 0,Brand
0,Yeezy
1,Yeezy
2,Yeezy
3,Yeezy
4,Yeezy
...,...
99951,Yeezy
99952,Yeezy
99953,Yeezy
99954,Yeezy


In [140]:
groups['Shuffled Profits'] = pd.Series(shuffled_profits['Profit'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [141]:
simulated_sample = groups

In [143]:
simulated_sample[['Brand', 'Shuffled Profits']]

Unnamed: 0,Brand,Shuffled Profits
0,Yeezy,877.0
1,Yeezy,465.0
2,Yeezy,470.0
3,Yeezy,855.0
4,Yeezy,608.0
...,...,...
99951,Yeezy,345.0
99952,Yeezy,378.0
99953,Yeezy,385.0
99954,Yeezy,430.0


In [157]:
simulated_sample = pd.DataFrame({'Brand':sneakerdata['Brand'], 'Shuffled Profits': shuffled_profits['Profit']})

In [160]:
simulated_sample

Unnamed: 0,Brand,Shuffled Profits
0,Yeezy,877.0
1,Yeezy,465.0
2,Yeezy,470.0
3,Yeezy,855.0
4,Yeezy,608.0
...,...,...
99951,Yeezy,345.0
99952,Yeezy,378.0
99953,Yeezy,385.0
99954,Yeezy,430.0


In [146]:
grouped_by_average = simulated_sample.groupby('Brand').mean().reset_index()
grouped_by_average[['Brand','Shuffled Profits']]

Unnamed: 0,Brand,Shuffled Profits
0,Off-White,492.102943
1,Yeezy,140.15873


In [147]:
grouped_by_average['Shuffled Profits'][1] - grouped_by_average['Shuffled Profits'][0]

-351.94421309530514

In [161]:
simulated_stats = make_array()

for i in np.arange(5000):
    shuffled_profits = profits.sample(frac = 1, replace = False)
    simulated_sample = pd.DataFrame({'Brand':sneakerdata['Brand'], 'Shuffled Profits': shuffled_profits['Profit']})
    grouped_by_average = simulated_sample.groupby('Brand').mean().reset_index()
    difference = grouped_by_average['Shuffled Profits'][1] - grouped_by_average['Shuffled Profits'][0]
    simulated_stats = np.append(simulated_stats, difference)

In [162]:
simulated_stats

array([-351.9442131, -351.9442131, -351.9442131, ..., -351.9442131,
       -351.9442131, -351.9442131])

Some ideas: Hypothesis test maybe for profit on the different type of brand?

P.S. Add my datascienceportfoliio tab to my github website.