In [26]:
import pandas as pd
import numpy as np

# for Box-Cox Transformation (Normalization)
from scipy import stats

# for scaling
from mlxtend.preprocessing import minmax_scaling

# plotting modules
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
# Load 'ks-projects-201801.csv' into a DataFrame. The path is relative, so it
# is resolved against the notebook's current working directory.
kickstarters_2018 = pd.read_csv(filepath_or_buffer='ks-projects-201801.csv')

In [28]:
# Min-max scale the `usd_goal_real` column. The column is wrapped in its own
# single-column DataFrame because `minmax_scaling` is called with a list of
# column names to rescale.
original_goal_real = pd.DataFrame(kickstarters_2018.usd_goal_real)
scaled_goal_real = minmax_scaling(original_goal_real, columns = ['usd_goal_real'])

# The min/max are taken on the column (a Series) so each reduction yields a
# scalar. `DataFrame.min()` / `.max()` return a one-element Series instead, and
# passing that to `float()` is deprecated in pandas (it emits a FutureWarning).
print('Original Goal\nPreviw:\n', original_goal_real.head())
print('Minimun value:', float(original_goal_real['usd_goal_real'].min()),
      '\nMaximun Value:', float(original_goal_real['usd_goal_real'].max()))
print('_' * 30)
print('Scaled Goal\nPreviw:\n', scaled_goal_real.head())
print('Minimun value:', float(scaled_goal_real['usd_goal_real'].min()),
      '\nMaximun Value:', float(scaled_goal_real['usd_goal_real'].max()))

Original Goal
Previw:
    usd_goal_real
0        1533.95
1       30000.00
2       45000.00
3        5000.00
4       19500.00
Minimun value: 0.01 
Maximun Value: 166361390.7
______________________________
Scaled Goal
Previw:
    usd_goal_real
0       0.000009
1       0.000180
2       0.000270
3       0.000030
4       0.000117
Minimun value: 0.0 
Maximun Value: 1.0


  print('Minimun value:', float(original_goal_real.min()), '\nMaximun Value:', float(original_goal_real.max()))
  print('Minimun value:', float(scaled_goal_real.min()), '\nMaximun Value:', float(scaled_goal_real.max()))


In [29]:
# Show the raw values before they are replaced.
print(kickstarters_2018['usd_goal_real'].head())

# Swap the raw column for its min-max-scaled version in one chain: dropping
# first and then assigning re-creates the column at the end of the frame, and
# the scaled Series is aligned on the index.
kickstarters_2018 = (
    kickstarters_2018
    .drop(columns = ['usd_goal_real'])
    .assign(usd_goal_real = scaled_goal_real['usd_goal_real'])
)

# Show the same rows after the swap.
print(kickstarters_2018['usd_goal_real'].head())

0     1533.95
1    30000.00
2    45000.00
3     5000.00
4    19500.00
Name: usd_goal_real, dtype: float64
0    0.000009
1    0.000180
2    0.000270
3    0.000030
4    0.000117
Name: usd_goal_real, dtype: float64


In [30]:
# Min-max scale the `goal` column. The double-bracket selection yields a
# single-column DataFrame, which is what is handed to `minmax_scaling`
# together with the list of columns to rescale.
original_goal = kickstarters_2018[['goal']]
scaled_goal = minmax_scaling(original_goal, columns = ['goal'])

In [31]:
# Swap the raw `goal` column for its scaled version. `scaled_goal` is a
# one-column frame, so its `goal` column is selected explicitly; the new column
# is appended at the end of the frame and aligned on the index.
kickstarters_2018 = (
    kickstarters_2018
    .drop(columns = ['goal'])
    .assign(goal = scaled_goal['goal'])
)

In [32]:
# Box-Cox requires strictly positive input, so first build a boolean mask of
# the rows whose `usd_pledged_real` is greater than zero.
index_positive_pledged_real = kickstarters_2018['usd_pledged_real'] > 0
print(index_positive_pledged_real.head(15))  # preview the mask to compare row labels below

# Keep only the qualifying rows; their original index labels are preserved.
positive_pledged_real = kickstarters_2018.loc[index_positive_pledged_real, 'usd_pledged_real']

# `stats.boxcox` returns a tuple here; element 0 holds the transformed values.
# They are wrapped in a Series that reuses the original index labels so the
# result can be aligned back onto the full frame later.
normalized_pledged_real = pd.Series(
    data = stats.boxcox(positive_pledged_real)[0],
    index = positive_pledged_real.index,
    name = 'usd_pledged_real',
)
print(normalized_pledged_real.head(15))  # labels should match the True rows of the mask

0     False
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10    False
11     True
12    False
13    False
14     True
Name: usd_pledged_real, dtype: bool
1     10.165142
2      6.468598
3      0.000000
4      9.129277
5     15.836853
6      9.029275
7      7.521421
8     12.287535
9     11.792075
11    13.080869
14    17.605187
15     8.099304
16     7.317866
17     8.364741
18     6.651330
Name: usd_pledged_real, dtype: float64


In [33]:
# Replace the raw `usd_pledged_real` column with its Box-Cox-transformed
# counterpart. `join` aligns on the index, so rows that were left out of the
# transform (pledged amount not > 0) come back as NaN in the new column.
kickstarters_2018 = kickstarters_2018.drop(columns = ['usd_pledged_real'])
kickstarters_2018 = kickstarters_2018.join(normalized_pledged_real)

# Fill only the re-joined column. Calling `fillna(0)` on the whole frame would
# also overwrite missing values in every unrelated column with 0.
kickstarters_2018['usd_pledged_real'] = kickstarters_2018['usd_pledged_real'].fillna(0)

# NOTE(review): a transformed value can itself be 0 (row 3 in the earlier
# preview is a positive pledge that maps to 0.0), so after this fill a 0 no
# longer distinguishes "nothing pledged" from such rows - confirm that is
# acceptable for downstream use.
print(kickstarters_2018['usd_pledged_real'].head(15))

0      0.000000
1     10.165142
2      6.468598
3      0.000000
4      9.129277
5     15.836853
6      9.029275
7      7.521421
8     12.287535
9     11.792075
10     0.000000
11    13.080869
12     0.000000
13     0.000000
14    17.605187
Name: usd_pledged_real, dtype: float64


In [35]:
# Box-Cox requires strictly positive input, so restrict the transform to the
# rows whose `pledged` value is greater than zero.
index_positive_pledged = kickstarters_2018.pledged > 0
positive_pledged = kickstarters_2018.pledged.loc[index_positive_pledged]

# `stats.boxcox` returns a tuple here; element 0 holds the transformed values.
# The original index labels are reused so the result can be joined back.
normalized_pledged = pd.Series(stats.boxcox(positive_pledged)[0], name = 'pledged', index = positive_pledged.index)

# Replace the raw column with the transformed one. `join` aligns on the index,
# so rows excluded from the transform come back as NaN in `pledged`.
kickstarters_2018 = kickstarters_2018.drop(columns = ['pledged'])
kickstarters_2018 = kickstarters_2018.join(normalized_pledged)

# Fill only the re-joined column. Calling `fillna(0)` on the whole frame would
# also overwrite missing values in every unrelated column with 0.
kickstarters_2018['pledged'] = kickstarters_2018['pledged'].fillna(0)

# NOTE(review): Box-Cox maps an input of exactly 1 to 0, so after this fill a
# 0 may be either "nothing pledged" or a transformed positive value - confirm
# that is acceptable for downstream use.
print(kickstarters_2018.pledged.head(15))

0      0.000000
1      6.898061
2      4.396085
3      0.000000
4      6.217916
5     10.416919
6      6.151486
7      5.131480
8      8.251373
9      7.939852
10     0.000000
11     8.745100
12     0.000000
13     0.000000
14    11.137003
Name: pledged, dtype: float64


In [36]:
# Persist the transformed frame as CSV, leaving the row index out of the file.
# NOTE(review): this is the same path the notebook reads its input from, so the
# raw data is overwritten and re-running the notebook would transform values
# that were already transformed - consider writing to a separate output file.
kickstarters_2018.to_csv(path_or_buf='ks-projects-201801.csv', index=False)