# Install pandas, numpy, scikit-learn


`pip install pandas`  
`pip install numpy`  
`pip install scikit-learn`

# Pandas Basics

In [1]:
import pandas as pd

# Column labels shared by both construction examples below.
header = ['name', 'math score', 'english score']

# Example 1: every record is a tuple whose fields line up with `header`.
my_list = [
    ('isaac', 60, 50),
    ('julie', 90, 70),
    ('alex', 30, 40),
]
df_from_list = pd.DataFrame.from_records(data=my_list, columns=header)

print('create dataframe from list')
print(df_from_list)

# Example 2: every record is a dict keyed by column label.
my_dict = [
    {'name': 'isaac', 'math score': 60, 'english score': 50},
    {'name': 'julie', 'math score': 90, 'english score': 70},
    {'name': 'alex', 'math score': 30, 'english score': 40},
]
# `columns` pins the column order of the resulting frame.
df_from_dict = pd.DataFrame(data=my_dict, columns=header)

print('create dataframe from dict')
print(df_from_dict)




create dataframe from list
    name  math score  english score
0  isaac          60             50
1  julie          90             70
2   alex          30             40
create dataframe from dict
    name  math score  english score
0  isaac          60             50
1  julie          90             70
2   alex          30             40


In [2]:
# Read the regular-season results CSV into a dataframe.
df = pd.read_csv('./dataset/RegularSeasonCompactResults.csv')

# print the first five rows
print(df.head())

# print the last five rows
print(df.tail())

# summary statistics of the dataframe
print(df.describe())

# print max value of each column
print(df.max())

# print the rows whose Wscore is greater than 150.
# A bare expression is only displayed when it is the last statement of a
# cell, so the filtered frame has to be printed explicitly here; otherwise
# the result is computed and silently discarded.
print(df[df['Wscore'] > 150])

# drop the first row, then renumber the index starting from 0
df_drop_row = df.drop(df.index[0])
df_reset_index1 = df_drop_row.reset_index(drop=True)
print(df_reset_index1.head())

# drop the 'Season' column (axis=1 selects columns rather than rows)
df_drop_column = df.drop('Season', axis=1)
print(df_drop_column.head())


   Season  Daynum  Wteam  Wscore  Lteam  Lscore Wloc  Numot
0    1985      20   1228      81   1328      64    N      0
1    1985      25   1106      77   1354      70    H      0
2    1985      25   1112      63   1223      56    H      0
3    1985      25   1165      70   1432      54    H      0
4    1985      25   1192      86   1447      74    H      0
        Season  Daynum  Wteam  Wscore  Lteam  Lscore Wloc  Numot
145284    2016     132   1114      70   1419      50    N      0
145285    2016     132   1163      72   1272      58    N      0
145286    2016     132   1246      82   1401      77    N      1
145287    2016     132   1277      66   1345      62    N      0
145288    2016     132   1386      87   1433      74    N      0
              Season         Daynum          Wteam         Wscore  \
count  145289.000000  145289.000000  145289.000000  145289.000000   
mean     2001.574834      75.223816    1286.720646      76.600321   
std         9.233342      33.287418     104

In [3]:
# Reload the regular-season results so this cell does not depend on earlier ones.
df = pd.read_csv('./dataset/RegularSeasonCompactResults.csv')

# Build a new dataframe from two of the original columns.
df_new = df[['Season', 'Daynum']]
print(df_new.head())

# Write the two-column frame to disk without the index column.
# (Adding header=False would also leave out the header row.)
df_new.to_csv('dfnew.csv', index=False)


def _after_day_23(day):
    """Return 1 when ``day`` is greater than 23, otherwise 0."""
    if day > 23:
        return 1
    return 0


# Map every Daynum value to a 0/1 flag with an element-wise function.
df_daynum = df_new['Daynum'].apply(_after_day_23)
print(df_daynum.head())



   Season  Daynum
0    1985      20
1    1985      25
2    1985      25
3    1985      25
4    1985      25
0    0
1    1
2    1
3    1
4    1
Name: Daynum, dtype: int64


# NumPy Basics

In [4]:
import numpy as np

# --- 1-D array built from a Python list ---
a = np.array([1, 2, 3])
print(type(a))             # <class 'numpy.ndarray'>
print(a.shape)             # (3,)
print(a[0], a[1], a[2])    # 1 2 3

# Arrays are mutable: overwrite the first element in place.
a[0] = 5
print(a)                   # [5 2 3]

# --- 2-D array built from nested lists ---
b = np.array([[1, 2, 3],
              [4, 5, 6]])
print(b.shape)                      # (2, 3)
print(b[0, 0], b[0, 1], b[1, 0])    # 1 2 4

# --- Constructor helpers (note: `a` and `b` are rebound below) ---

# 2x2 array of zeros.
a = np.zeros(shape=(2, 2))
print(a)    # [[0. 0.]
            #  [0. 0.]]

# 1x2 array of ones.
b = np.ones(shape=(1, 2))
print(b)    # [[1. 1.]]

# 2x2 array filled with a constant; an integer fill value prints as integers.
c = np.full(shape=(2, 2), fill_value=7)
print(c)    # [[7 7]
            #  [7 7]]

# 2x2 identity matrix.
d = np.eye(N=2)
print(d)    # [[1. 0.]
            #  [0. 1.]]

# 2x2 array of random values; the numbers differ on every run, e.g.
# [[0.13219466 0.58185592]
#  [0.90149434 0.89322365]]
e = np.random.random(size=(2, 2))
print(e)


<class 'numpy.ndarray'>
(3,)
1 2 3
[5 2 3]
(2, 3)
1 2 4
[[0. 0.]
 [0. 0.]]
[[1. 1.]]
[[7 7]
 [7 7]]
[[1. 0.]
 [0. 1.]]
[[0.13219466 0.58185592]
 [0.90149434 0.89322365]]


ref: http://cs231n.github.io/python-numpy-tutorial/#numpy

# Load CSV file

In [5]:
import pandas as pd

# Read the demo table; the first CSV row supplies the column names.
# (For a file with no header row, pass header=None to read_csv instead.)
df = pd.read_csv('./dataset/demo.csv')

# Show the whole table, followed by its shape tuple.
print(df)
print(df.shape)

# Pull a single column out as a numpy array and show it.
number_room_values = df['number_room'].values
print(number_room_values)

   size  number_room  house_price house_type
0  40.0          3.0          800        old
1  29.0          5.0          700      young
2  33.0          2.0          670      young
3   NaN          2.0          770        old
4   NaN          NaN          870      young
(5, 4)
[ 3.  5.  2.  2. nan]


# Missing value

In [6]:
import pandas as pd

# Load the demo table; its 'size' and 'number_room' columns contain NaN.
df = pd.read_csv('./dataset/demo.csv')

print('origin dataframe')
print(df)

# Count the missing entries per column.
print('# of nan value in each columns:')
print(df.isnull().sum())

# dropna() returns a new frame without the rows that contain any NaN;
# `df` itself is left unchanged.
print('drop row that contain any missing value')
df_no_missing = df.dropna()
print(df_no_missing)

# Fill the missing 'size' values with the column mean.
# The filled column is assigned back to the frame rather than calling
# fillna(..., inplace=True) on df["size"]: an in-place call on a column
# selection is chained assignment, which warns in pandas 2.x and does not
# update `df` at all once Copy-on-Write is enabled.
print('fill missing value with mean')
df["size"] = df["size"].fillna(df["size"].mean())
print(df)

origin dataframe
   size  number_room  house_price house_type
0  40.0          3.0          800        old
1  29.0          5.0          700      young
2  33.0          2.0          670      young
3   NaN          2.0          770        old
4   NaN          NaN          870      young
# of nan value in each columns:
size           2
number_room    1
house_price    0
house_type     0
dtype: int64
drop row that contain any missing value
   size  number_room  house_price house_type
0  40.0          3.0          800        old
1  29.0          5.0          700      young
2  33.0          2.0          670      young
fill missing value with mean
   size  number_room  house_price house_type
0  40.0          3.0          800        old
1  29.0          5.0          700      young
2  33.0          2.0          670      young
3  34.0          2.0          770        old
4  34.0          NaN          870      young


# Encoding categorical features

In [7]:
import pandas as pd

# Load the demo table; 'house_type' holds text labels.
df = pd.read_csv('./dataset/demo.csv')

print('origin dataframe')
print(df)

# Approach 1: replace each label with its integer category code.
print('encode category')
house_type_categorical = pd.Categorical(df['house_type'])
df['house_type'] = house_type_categorical.codes
print(df)

print('\n\n=============\n\n')

# Approach 2: reload the untouched data, then expand the text column into
# one indicator column per label.
df = pd.read_csv('./dataset/demo.csv')
print('origin dataframe')
print(df)
df = pd.get_dummies(data=df)
print(df)


origin dataframe
   size  number_room  house_price house_type
0  40.0          3.0          800        old
1  29.0          5.0          700      young
2  33.0          2.0          670      young
3   NaN          2.0          770        old
4   NaN          NaN          870      young
encode category
   size  number_room  house_price  house_type
0  40.0          3.0          800           0
1  29.0          5.0          700           1
2  33.0          2.0          670           1
3   NaN          2.0          770           0
4   NaN          NaN          870           1




origin dataframe
   size  number_room  house_price house_type
0  40.0          3.0          800        old
1  29.0          5.0          700      young
2  33.0          2.0          670      young
3   NaN          2.0          770        old
4   NaN          NaN          870      young
   size  number_room  house_price  house_type_old  house_type_young
0  40.0          3.0          800               1             

# Change dataframe into numpy array

In [8]:
import pandas as pd
import numpy as np
# Load the demo table that will be converted back and forth.
df = pd.read_csv('./dataset/demo.csv')

# DataFrame -> numpy array.
print('change dataframe to numpy array')
numpy_array = np.array(df)
print(numpy_array)

# numpy array -> DataFrame.  No column labels are passed, so the new frame
# gets the default integer labels.
print('change numpy array to dataframe')
df_from_numpy = pd.DataFrame(data=numpy_array)
print(df_from_numpy)


change dataframe to numpy array
[[40.0 3.0 800 'old']
 [29.0 5.0 700 'young']
 [33.0 2.0 670 'young']
 [nan 2.0 770 'old']
 [nan nan 870 'young']]
change numpy array to dataframe
      0    1    2      3
0  40.0  3.0  800    old
1  29.0  5.0  700  young
2  33.0  2.0  670  young
3   NaN  2.0  770    old
4   NaN  NaN  870  young


# Split data

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split

# Toy dataset: ten samples with two features each, plus one label per sample.
x, y = np.arange(20).reshape((10, 2)), np.arange(10)

print('before splitting......')

print("x: {}\n".format(x))
print("y: {}\n".format(y))

print("shape of x: {}".format(x.shape))
print("shape of y: {}\n".format(y.shape))

# Hold out 30% of the samples as the test set.
# random_state is fixed so the shuffle, and therefore every output below,
# is the same on each run; without it the split changes on every execution
# and the recorded output cannot be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)
print('after splitting......')

print("x_train: {}\n".format(x_train))
print("x_test: {}\n".format(x_test))

print("y_train: {}\n".format(y_train))
print("y_test: {}\n".format(y_test))

before splitting......
x: [[ 0  1]
 [ 2  3]
 [ 4  5]
 [ 6  7]
 [ 8  9]
 [10 11]
 [12 13]
 [14 15]
 [16 17]
 [18 19]]

y: [0 1 2 3 4 5 6 7 8 9]

shape of x: (10, 2)
shape of y: (10,)

after splitting......
x_train: [[16 17]
 [18 19]
 [ 4  5]
 [ 6  7]
 [14 15]
 [ 8  9]
 [10 11]]

x_test: [[12 13]
 [ 2  3]
 [ 0  1]]

y_train: [8 9 2 3 7 4 5]

y_test: [6 1 0]



# Preprocessing Data

## Standardize data to zero mean and unit standard deviation

In [10]:
from sklearn import preprocessing
import numpy as np

# Training data: three samples (rows) with three features (columns).
x_train = np.array([
    [100., -1.,  2.],
    [900.,  0.,  0.],
    [200.,  1., -1.],
])

# Per-column statistics of the raw training data, computed with numpy.
column_mean = x_train.mean(axis=0)
column_std = x_train.std(axis=0)
print("mean of x_train: {}".format(column_mean))
print("std of x_train: {}\n".format(column_std))

# Fit the scaler on the training data and show the statistics it stored.
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)

print("mean of x_scale: {}".format(scaler.mean_))
print("std of x_scale: {}\n".format(scaler.scale_))

# Standardize the training data with the fitted mean and std.
# Note: x_train is rebound to its standardized version here.
x_train = scaler.transform(x_train)

print("after standardiztion......")
print('x_train: {}'.format(x_train))

# New (test) data goes through the scaler fitted on the training data;
# the scaler is not re-fitted.
x_test = np.array([[-1., 1., 0.]])
print("apply same mean and std to new data(test data)\n")

x_test = scaler.transform(x_test)
print('x_test: {}'.format(x_test))



mean of x_train: [4.00000000e+02 0.00000000e+00 3.33333333e-01]
std of x_train: [355.9026084    0.81649658   1.24721913]

mean of x_scale: [4.00000000e+02 0.00000000e+00 3.33333333e-01]
std of x_scale: [355.9026084    0.81649658   1.24721913]

after standardiztion......
x_train: [[-0.84292723 -1.22474487  1.33630621]
 [ 1.40487872  0.         -0.26726124]
 [-0.56195149  1.22474487 -1.06904497]]
apply same mean and std to new data(test data)

x_test: [[-1.12671273  1.22474487 -0.26726124]]


## Scale data into a range (min-max scaling)

In [11]:
from sklearn import preprocessing
import numpy as np

# Training data: three samples (rows) with three features (columns).
x_train = np.array([
    [1., -1.,  2.],
    [2.,  0.,  0.],
    [0.,  1., -1.],
])

# Fit the min-max scaler on the training data, then rescale that same data.
# Note: x_train is rebound to its rescaled version here.
scaler = preprocessing.MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

print("after standardiztion......")
print('x_train: {}'.format(x_train))

# New (test) data reuses the transformation fitted on the training data, so
# its values can land outside the range the training data was mapped to.
x_test = np.array([[-3., -1., 4.]])
print("apply same transformation to new data(test data)\n")

x_test = scaler.transform(x_test)
print('x_test: {}'.format(x_test))

after standardiztion......
x_train: [[0.5        0.         1.        ]
 [1.         0.5        0.33333333]
 [0.         1.         0.        ]]
apply same transformation to new data(test data)

x_test: [[-1.5         0.          1.66666667]]


# Evaluate Result

In [12]:
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

# True labels and predicted labels for five samples.
y_test = [0, 1, 0, 1, 0]
y_pred = [1, 0, 0, 1, 0]

# Regression-style metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Classification metrics: with normalize=False, accuracy_score gives the
# number of correct predictions instead of a fraction.
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
accuracy = accuracy_score(y_test, y_pred)
con_matrix = confusion_matrix(y_test, y_pred)

# Report every metric, one per line.
print("Mean squared error: {}".format(mse))
print('r2 score: {}'.format(r2))
print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('confusion matrix: {}'.format(con_matrix))

Mean squared error: 0.4
r2 score: -0.6666666666666665
number of correct sample: 3
accuracy: 0.6
confusion matrix: [[2 1]
 [1 1]]


# Kaggle Introduction

kaggle website: https://www.kaggle.com/  
kaggle api: https://github.com/Kaggle/kaggle-api