In [1]:
import pandas as pd
import numpy as np

In [2]:
raw_data = {'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
        'age': [42, np.nan, 36, 24, 73],
        'sex': ['m', np.nan, 'f', 'm', 'f'],
        'preTestScore': [4, np.nan, np.nan, 2, 3],
        'postTestScore': [25, np.nan, np.nan, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'sex', 'preTestScore', 'postTestScore'])
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [4]:
df.isnull().sum()/len(df)

first_name       0.2
last_name        0.2
age              0.2
sex              0.2
preTestScore     0.4
postTestScore    0.4
dtype: float64

In [6]:
df_no_missing = df.dropna()
df_no_missing

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [7]:
df_cleaned = df.dropna(how="all")
df_cleaned

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [8]:
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [9]:
df['location'] = np.nan
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [10]:
df.dropna(axis=1, how='all')

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [15]:
df.dropna(axis=0, thresh=1)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [17]:
df.dropna(thresh=5)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [19]:
df.fillna(0)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,0.0
1,0,0,0.0,0,0.0,0.0,0.0
2,Tina,Ali,36.0,f,0.0,0.0,0.0
3,Jake,Milner,24.0,m,2.0,62.0,0.0
4,Amy,Cooze,73.0,f,3.0,70.0,0.0


In [22]:
df["preTestScore"].mean()
df["preTestScore"].median()
df["preTestScore"].mode()

0    2.0
1    3.0
2    4.0
dtype: float64

In [23]:
df["preTestScore"].fillna(df["preTestScore"].mean(), inplace=True)

In [24]:
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [25]:
df.groupby("sex")["postTestScore"].transform("mean")

0    43.5
1     NaN
2    70.0
3    43.5
4    70.0
Name: postTestScore, dtype: float64

In [27]:
df["postTestScore"].fillna(
    df.groupby("sex")["postTestScore"].transform("mean"), inplace=True)

In [28]:
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,70.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [30]:
df[df['age'].notnull() & df['sex'].notnull()]

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
2,Tina,Ali,36.0,f,3.0,70.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [32]:
# Categorical Data
edges = pd.DataFrame({'source': [0,1,2],
                  'target':[2,2,3],
                  'weight': [3,4,5],
                  'color': ['red', 'blue','blue']})
edges

Unnamed: 0,color,source,target,weight
0,red,0,2,3
1,blue,1,2,4
2,blue,2,3,5


In [33]:
edges.dtypes

color     object
source     int64
target     int64
weight     int64
dtype: object

In [34]:
edges["color"]

0     red
1    blue
2    blue
Name: color, dtype: object

In [35]:
pd.get_dummies(edges)

Unnamed: 0,source,target,weight,color_blue,color_red
0,0,2,3,0,1
1,1,2,4,1,0
2,2,3,5,1,0


In [36]:
pd.get_dummies(edges["color"])

Unnamed: 0,blue,red
0,0,1
1,1,0
2,1,0


In [39]:
weight_dict = {3:'M', 4:'L', 5:'XL'}
edges["weigh_sign"] = edges["weight"].map(weight_dict)
edges

Unnamed: 0,color,source,target,weight,wegiht_sign,weigh_sign
0,red,0,2,3,M,M
1,blue,1,2,4,L,L
2,blue,2,3,5,XL,XL


In [40]:
weight_sign = pd.get_dummies(edges["wegiht_sign"])
weight_sign

Unnamed: 0,L,M,XL
0,0,1,0
1,1,0,0
2,0,0,1


In [47]:
pd.get_dummies(edges).values

array([[0, 2, 3, 0, 1, 0, 1, 0, 0, 1, 0],
       [1, 2, 4, 1, 0, 1, 0, 0, 1, 0, 0],
       [2, 3, 5, 1, 0, 0, 0, 1, 0, 0, 1]], dtype=int64)

In [48]:
# Data Binning

raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
        'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
        'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'],
        'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['regiment', 'company', 'name', 'preTestScore', 'postTestScore'])
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
4,Dragoons,1st,Cooze,3,70
5,Dragoons,1st,Jacon,4,25
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
9,Scouts,1st,Piger,3,70


In [57]:
bins = [i for i in range(0,101, 25)]

group_names = ["Low", "okay", "Good", "Great"]
categories = pd.cut(df['postTestScore'], bins, labels = group_names)
categories

0       Low
1     Great
2      Good
3      Good
4      Good
5       Low
6     Great
7      Good
8      Good
9      Good
10     Good
11     Good
Name: postTestScore, dtype: category
Categories (4, object): [Low < okay < Good < Great]

In [59]:
# using scikit-learn
raw_example = df.as_matrix()
raw_example[:3]

array([['Nighthawks', '1st', 'Miller', 4, 25],
       ['Nighthawks', '1st', 'Jacobson', 24, 94],
       ['Nighthawks', '2nd', 'Ali', 31, 57]], dtype=object)

In [60]:
data = raw_example.copy()

In [62]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le

LabelEncoder()

In [66]:
raw_example

array([['Nighthawks', '1st', 'Miller', 4, 25],
       ['Nighthawks', '1st', 'Jacobson', 24, 94],
       ['Nighthawks', '2nd', 'Ali', 31, 57],
       ['Nighthawks', '2nd', 'Milner', 2, 62],
       ['Dragoons', '1st', 'Cooze', 3, 70],
       ['Dragoons', '1st', 'Jacon', 4, 25],
       ['Dragoons', '2nd', 'Ryaner', 24, 94],
       ['Dragoons', '2nd', 'Sone', 31, 57],
       ['Scouts', '1st', 'Sloan', 2, 62],
       ['Scouts', '1st', 'Piger', 3, 70],
       ['Scouts', '2nd', 'Riani', 2, 62],
       ['Scouts', '2nd', 'Ali', 3, 70]], dtype=object)

In [67]:
le.fit(raw_example[:,0])

LabelEncoder()

In [68]:
le.classes_

array(['Dragoons', 'Nighthawks', 'Scouts'], dtype=object)

In [69]:
le.transform(raw_example[:,0])

array([1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2], dtype=int64)

In [75]:
data[:,0] = le.transform(raw_example[:,0])
data[:,]

NotFittedError: This LabelEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [76]:
label_column = [0,1,2,5]
label_encoder_list = []
for column_index in label_column:
    le = preprocessing.LabelEncoder()
    le.fit(raw_example[:, column_index])
    data[:, column_index] = le.transform(raw_example[:, column_index])
    label_encoder_list.append(le)
    del le
data[:3]

IndexError: index 5 is out of bounds for axis 1 with size 5

In [77]:
# feature scaling
df = pd.DataFrame({'A':[14.00,90.20,90.95,96.27,91.21],'B':[103.02,107.26,110.35,114.23,114.68], 'C':['big','small','big','small','small']})
df

Unnamed: 0,A,B,C
0,14.0,103.02,big
1,90.2,107.26,small
2,90.95,110.35,big
3,96.27,114.23,small
4,91.21,114.68,small


In [78]:
df["A"]-df["A"].min()

0     0.00
1    76.20
2    76.95
3    82.27
4    77.21
Name: A, dtype: float64

In [79]:
(df["A"]-df["A"].min())/(df["A"].max()-df["A"].min())

0    0.000000
1    0.926219
2    0.935335
3    1.000000
4    0.938495
Name: A, dtype: float64

In [80]:
df["A"] = (df["A"]-df["A"].min())
/(df["A"].max()-df["A"].min())*(5-1)+1
df

Unnamed: 0,A,B,C
0,0.0,103.02,big
1,76.2,107.26,small
2,76.95,110.35,big
3,82.27,114.23,small
4,77.21,114.68,small


In [81]:
df["B"] = (df["B"]-df["B"].mean()) / (df["B"].std())

In [82]:
df

Unnamed: 0,A,B,C
0,0.0,-1.40525,big
1,76.2,-0.54023,small
2,76.95,0.090174,big
3,82.27,0.881749,small
4,77.21,0.973556,small


In [84]:
def feature_scaling(df, scaling_strategy ="min-max", column =None):
    if column == None:
        column = [column_name for column_name in df.columns]
    for column_name in column:
        if scaling_strategy == "min-max":
            df[column_name] = (df[column_name] - df[column_name].min()) /\
                                (df[column_name].max() - df[column_name].min())
        elif scaling_strategy == "z-score":
            df[column_name] = (df[column_name] - \
                              df[column_name].mean()) /\
                                (df[column_name].std())
    return df

In [85]:

df = pd.DataFrame({'A':[14.00,90.20,90.95,96.27,91.21],'B':[103.02,107.26,110.35,114.23,114.68], 'C':['big','small','big','small','small']})
df

Unnamed: 0,A,B,C
0,14.0,103.02,big
1,90.2,107.26,small
2,90.95,110.35,big
3,96.27,114.23,small
4,91.21,114.68,small


In [86]:
feature_scaling(df, column=["A","B"])

Unnamed: 0,A,B,C
0,0.0,0.0,big
1,0.926219,0.363636,small
2,0.935335,0.628645,big
3,1.0,0.961407,small
4,0.938495,1.0,small


In [87]:
#
df = pd.io.parsers.read_csv(
    'https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv',
     header=None,
     usecols=[0,1,2]
    )

df.columns=['Class label', 'Alcohol', 'Malic acid']

df.head()

Unnamed: 0,Class label,Alcohol,Malic acid
0,1,14.23,1.71
1,1,13.2,1.78
2,1,13.16,2.36
3,1,14.37,1.95
4,1,13.24,2.59


In [88]:
df = feature_scaling(df, 'min-max', column=["Alcohol", "Malic acid"])
df.head()

Unnamed: 0,Class label,Alcohol,Malic acid
0,1,0.842105,0.1917
1,1,0.571053,0.205534
2,1,0.560526,0.320158
3,1,0.878947,0.23913
4,1,0.581579,0.365613


In [90]:
std_scaler  = preprocessing.StandardScaler().fit(df[["Alcohol", "Malic acid"]])

In [91]:
df_std = std_scaler.transform(df[["Alcohol", "Malic acid"]])
df_std

array([[ 1.51861254, -0.5622498 ],
       [ 0.24628963, -0.49941338],
       [ 0.19687903,  0.02123125],
       [ 1.69154964, -0.34681064],
       [ 0.29570023,  0.22769377],
       [ 1.48155459, -0.51736664],
       [ 1.71625494, -0.4186237 ],
       [ 1.3086175 , -0.16727801],
       [ 2.25977152, -0.62508622],
       [ 1.0615645 , -0.88540853],
       [ 1.3580281 , -0.15830138],
       [ 1.38273339, -0.76871232],
       [ 0.92568536, -0.54429654],
       [ 2.16095032, -0.54429654],
       [ 1.70390229, -0.4186237 ],
       [ 0.77745356, -0.47248348],
       [ 1.60508109, -0.37374054],
       [ 1.02450655, -0.68792264],
       [ 1.46920194, -0.66996938],
       [ 0.78980621,  0.68550197],
       [ 1.3086175 , -0.63406285],
       [-0.08723191,  1.31386618],
       [ 0.87627476, -0.42760033],
       [-0.18605311, -0.66099274],
       [ 0.61686912, -0.47248348],
       [ 0.06099988, -0.25704433],
       [ 0.48098997, -0.50839001],
       [ 0.36981612, -0.55327317],
       [ 1.07391715,

In [92]:
minmax_scaler = preprocessing.MinMaxScaler().fit(df[["Alcohol", "Malic acid"]])

In [93]:
minmax_scaler.transform(df[["Alcohol", "Malic acid"]])

array([[0.84210526, 0.1916996 ],
       [0.57105263, 0.2055336 ],
       [0.56052632, 0.3201581 ],
       [0.87894737, 0.23913043],
       [0.58157895, 0.36561265],
       [0.83421053, 0.20158103],
       [0.88421053, 0.22332016],
       [0.79736842, 0.27865613],
       [1.        , 0.17786561],
       [0.74473684, 0.12055336],
       [0.80789474, 0.28063241],
       [0.81315789, 0.14624506],
       [0.71578947, 0.19565217],
       [0.97894737, 0.19565217],
       [0.88157895, 0.22332016],
       [0.68421053, 0.21146245],
       [0.86052632, 0.23320158],
       [0.73684211, 0.16403162],
       [0.83157895, 0.16798419],
       [0.68684211, 0.46640316],
       [0.79736842, 0.17588933],
       [0.5       , 0.60474308],
       [0.70526316, 0.22134387],
       [0.47894737, 0.16996047],
       [0.65      , 0.21146245],
       [0.53157895, 0.25889328],
       [0.62105263, 0.20355731],
       [0.59736842, 0.19367589],
       [0.74736842, 0.22924901],
       [0.78684211, 0.18577075],
       [0.

NameError: name 'df_minmax' is not defined