# Preprocessing for ML

In [1]:
import numpy as np
import pandas as pd

### Missing data

In [2]:
volunteer = pd.read_csv('data/volunteer_opportunities.csv')
volunteer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665 entries, 0 to 664
Data columns (total 35 columns):
opportunity_id        665 non-null int64
content_id            665 non-null int64
vol_requests          665 non-null int64
event_time            665 non-null int64
title                 665 non-null object
hits                  665 non-null int64
summary               665 non-null object
is_priority           62 non-null object
category_id           617 non-null float64
category_desc         617 non-null object
amsl                  0 non-null float64
amsl_unit             0 non-null float64
org_title             665 non-null object
org_content_id        665 non-null int64
addresses_count       665 non-null int64
locality              595 non-null object
region                665 non-null object
postalcode            659 non-null float64
primary_loc           0 non-null float64
display_url           665 non-null object
recurrence_type       665 non-null object
hours                 

In [3]:
# Check how many values are missing in the category_desc column
print(volunteer['category_desc'].isnull().sum())

volunteer_subset = volunteer[volunteer['category_desc'].notnull()]
print(volunteer_subset.shape)

48
(617, 35)


---
### Class imbalance

In [4]:
volunteer['category_desc'].value_counts()

Strengthening Communities    307
Helping Neighbors in Need    119
Education                     92
Health                        52
Environment                   32
Emergency Preparedness        15
Name: category_desc, dtype: int64

In [5]:
# Stratified sampling
from sklearn.model_selection import train_test_split

vol_X = volunteer.drop('category_desc', axis=1)
vol_y = volunteer[['category_desc']]

X_train, X_test, y_train, y_test = train_test_split(vol_X, vol_y, stratify=vol_y)

print(y_train['category_desc'].value_counts())

Strengthening Communities    230
Helping Neighbors in Need     89
Education                     69
Health                        39
Environment                   24
Emergency Preparedness        11
Name: category_desc, dtype: int64


---
### Normalization & scaling

In [6]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)

In [7]:
wine = pd.read_csv('data/wine_types.csv')
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
Type                            178 non-null int64
Alcohol                         178 non-null float64
Malic acid                      178 non-null float64
Ash                             178 non-null float64
Alcalinity of ash               178 non-null float64
Magnesium                       178 non-null int64
Total phenols                   178 non-null float64
Flavanoids                      178 non-null float64
Nonflavanoid phenols            178 non-null float64
Proanthocyanins                 178 non-null float64
Color intensity                 178 non-null float64
Hue                             178 non-null float64
OD280/OD315 of diluted wines    178 non-null float64
Proline                         178 non-null int64
dtypes: float64(11), int64(3)
memory usage: 19.5 KB


In [8]:
# int --> float
wine['Magnesium'] = wine['Magnesium'].astype(float)
wine['Proline'] = wine['Proline'].astype(float)

wine.describe()

Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,1.938202,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.775035,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,1.0,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,1.0,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,2.0,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,3.0,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,3.0,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [9]:
X = wine.drop('Type', axis=1)
y = np.array(wine[['Type']]).reshape(-1)

In [10]:
# Modeling without normalizing

X_train, X_test, y_train, y_test = train_test_split(X, y)

knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))

0.7111111111111111


In [11]:
# Log normalization

print(wine["Proline"].var())   # 분산

wine['Proline_log'] = np.log(wine.Proline)
print(wine['Proline_log'].var())

99166.71735542428
0.17231366191842018


In [12]:
# Import StandardScaler from scikit-learn
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

X_scaled = ss.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))

1.0


---
### Feature engineering

In [13]:
hiking = pd.read_json('data/hiking.json')
hiking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 11 columns):
Accessible        33 non-null object
Difficulty        27 non-null object
Length            29 non-null object
Limited_Access    33 non-null object
Location          33 non-null object
Name              33 non-null object
Other_Details     31 non-null object
Park_Name         33 non-null object
Prop_ID           33 non-null object
lat               0 non-null float64
lon               0 non-null float64
dtypes: float64(2), object(9)
memory usage: 2.9+ KB


In [14]:
hiking.head()

Unnamed: 0,Accessible,Difficulty,Length,Limited_Access,Location,Name,Other_Details,Park_Name,Prop_ID,lat,lon
0,Y,,0.8 miles,N,"Enter behind the Salt Marsh Nature Center, loc...",Salt Marsh Nature Trail,<p>The first half of this mile-long trail foll...,Marine Park,B057,,
1,N,Easy,1.0 mile,N,Enter Park at Lincoln Road and Ocean Avenue en...,Lullwater,Explore the Lullwater to see how nature thrive...,Prospect Park,B073,,
2,N,Easy,0.75 miles,N,Enter Park at Lincoln Road and Ocean Avenue en...,Midwood,Step back in time with a walk through Brooklyn...,Prospect Park,B073,,
3,N,Easy,0.5 miles,N,Enter Park at Lincoln Road and Ocean Avenue en...,Peninsula,Discover how the Peninsula has changed over th...,Prospect Park,B073,,
4,N,Easy,0.5 miles,N,Enter Park at Lincoln Road and Ocean Avenue en...,Waterfall,Trace the source of the Lake on the Waterfall ...,Prospect Park,B073,,


In [15]:
# convert Y/N to 1/0
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()

hiking['Accessible_enc'] = enc.fit_transform(hiking['Accessible'])
print(hiking[['Accessible', 'Accessible_enc']].head())

  Accessible  Accessible_enc
0          Y               1
1          N               0
2          N               0
3          N               0
4          N               0


In [16]:
# convert to categorcal type
hiking['Difficulty'] = hiking['Difficulty'].astype('category')
hiking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 12 columns):
Accessible        33 non-null object
Difficulty        27 non-null category
Length            29 non-null object
Limited_Access    33 non-null object
Location          33 non-null object
Name              33 non-null object
Other_Details     31 non-null object
Park_Name         33 non-null object
Prop_ID           33 non-null object
lat               0 non-null float64
lon               0 non-null float64
Accessible_enc    33 non-null int32
dtypes: category(1), float64(2), int32(1), object(8)
memory usage: 3.2+ KB


In [17]:
# count elements
hiking['Park_Name'].value_counts()

Van Cortlandt Park                  5
La Tourette Parks & Golf Course     5
Prospect Park                       4
Forest Park                         3
High Rock Park                      2
Alley Pond Park                     1
Bronx Park                          1
William T. Davis Wildlife Refuge    1
Wolfes Pond Park                    1
Willowbrook Park                    1
Clove Lakes Park                    1
Marine Park                         1
Pelham Bay Park                     1
Inwood Hill Park                    1
Conference House Park               1
Long Pond Park                      1
Deere Park                          1
Arden Woods                         1
Cunningham Park                     1
Name: Park_Name, dtype: int64

In [18]:
# one-hot encoding
category_enc = pd.get_dummies(volunteer["category_desc"])
print(category_enc.head())

   Education  Emergency Preparedness  Environment  Health  \
0          0                       0            0       0   
1          0                       0            0       0   
2          0                       0            0       0   
3          0                       0            0       0   
4          0                       0            1       0   

   Helping Neighbors in Need  Strengthening Communities  
0                          0                          0  
1                          0                          1  
2                          0                          1  
3                          0                          1  
4                          0                          0  


In [19]:
# numerical features - taking an average

In [20]:
running = pd.read_csv('data/running_times.csv')
running

Unnamed: 0,name,run1,run2,run3,run4,run5
0,Sue,20.1,18.5,19.6,20.3,18.3
1,Jane,16.5,17.1,16.9,17.6,17.3
2,Rose,23.5,25.1,25.2,24.6,23.9


In [21]:
run_columns = ['run1', 'run2', 'run3', 'run4', 'run5']

# Use apply to create a mean column
running["mean"] = running.apply(lambda row: row[run_columns].mean(), axis=1)
print(running)

   name  run1  run2  run3  run4  run5   mean
0   Sue  20.1  18.5  19.6  20.3  18.3  19.36
1  Jane  16.5  17.1  16.9  17.6  17.3  17.08
2  Rose  23.5  25.1  25.2  24.6  23.9  24.46


In [22]:
# numerical features - datetime

volunteer["start_date_converted"] = pd.to_datetime(volunteer["start_date_date"])

# Extract just the month from the converted column
volunteer["start_date_month"] = volunteer["start_date_converted"].apply(lambda row: row.month)
print(volunteer[["start_date_converted", "start_date_month"]].head())

  start_date_converted  start_date_month
0           2011-07-30                 7
1           2011-02-01                 2
2           2011-01-29                 1
3           2011-02-14                 2
4           2011-02-05                 2


### features from strings - extraction

In [23]:
import re

In [24]:
hiking["Length"].head()

0     0.8 miles
1      1.0 mile
2    0.75 miles
3     0.5 miles
4     0.5 miles
Name: Length, dtype: object

In [25]:
def return_mileage(length):
    pattern = re.compile(r"\d+\.\d+")
    
    # Search the text for matches
    mile = re.match(pattern, str(length))
    
    # If a value is returned, use group(0) to return the found value
    if mile is not None:
        return float(mile.group(0))
        
hiking["Length_num"] = hiking["Length"].apply(lambda row: return_mileage(row))
print(hiking[["Length", "Length_num"]].head())

       Length  Length_num
0   0.8 miles        0.80
1    1.0 mile        1.00
2  0.75 miles        0.75
3   0.5 miles        0.50
4   0.5 miles        0.50


### Checking for correlated features

In [26]:
wine_x = wine.drop(['Type','Proline_log'], axis=1)

# Print out the column correlations of the wine dataset
wine_x.corr()

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
Alcohol,1.0,0.094397,0.211545,-0.310235,0.270798,0.289101,0.236815,-0.155929,0.136698,0.546364,-0.071747,0.072343,0.64372
Malic acid,0.094397,1.0,0.164045,0.2885,-0.054575,-0.335167,-0.411007,0.292977,-0.220746,0.248985,-0.561296,-0.36871,-0.192011
Ash,0.211545,0.164045,1.0,0.443367,0.286587,0.12898,0.115077,0.18623,0.009652,0.258887,-0.074667,0.003911,0.223626
Alcalinity of ash,-0.310235,0.2885,0.443367,1.0,-0.083333,-0.321113,-0.35137,0.361922,-0.197327,0.018732,-0.273955,-0.276769,-0.440597
Magnesium,0.270798,-0.054575,0.286587,-0.083333,1.0,0.214401,0.195784,-0.256294,0.236441,0.19995,0.055398,0.066004,0.393351
Total phenols,0.289101,-0.335167,0.12898,-0.321113,0.214401,1.0,0.864564,-0.449935,0.612413,-0.055136,0.433681,0.699949,0.498115
Flavanoids,0.236815,-0.411007,0.115077,-0.35137,0.195784,0.864564,1.0,-0.5379,0.652692,-0.172379,0.543479,0.787194,0.494193
Nonflavanoid phenols,-0.155929,0.292977,0.18623,0.361922,-0.256294,-0.449935,-0.5379,1.0,-0.365845,0.139057,-0.26264,-0.50327,-0.311385
Proanthocyanins,0.136698,-0.220746,0.009652,-0.197327,0.236441,0.612413,0.652692,-0.365845,1.0,-0.02525,0.295544,0.519067,0.330417
Color intensity,0.546364,0.248985,0.258887,0.018732,0.19995,-0.055136,-0.172379,0.139057,-0.02525,1.0,-0.521813,-0.428815,0.3161


---
# Exercise - UFO

In [27]:
ufo = pd.read_csv('data/ufo_sightings_large.csv', encoding='utf-8')
ufo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4935 entries, 0 to 4934
Data columns (total 11 columns):
date              4935 non-null object
city              4926 non-null object
state             4516 non-null object
country           4255 non-null object
type              4776 non-null object
seconds           4935 non-null float64
length_of_time    4792 non-null object
desc              4932 non-null object
recorded          4935 non-null object
lat               4935 non-null object
long              4935 non-null float64
dtypes: float64(2), object(9)
memory usage: 424.2+ KB


In [28]:
# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo["date"])

In [29]:
# Drop missing data

In [30]:
# Check how many values are missing
print(ufo[['length_of_time', 'state', 'type']].isnull().sum())

# Keep only rows where length_of_time, state, and type are not null
ufo_full = ufo[ufo.length_of_time.notnull() & 
               ufo.state.notnull() & 
               ufo.type.notnull()]

print(ufo_full.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4283, 11)


In [31]:
# Extracting numbers from strings

In [32]:
ufo["length_of_time"].head(10)

0                      2 weeks
1                       30sec.
2                          NaN
3              about 5 minutes
4                            2
5                   10 minutes
6    total? maybe around 10 mi
7    several sightings from 10
8                    2 minutes
9                    2 minutes
Name: length_of_time, dtype: object

In [33]:
def return_minutes(time_string):
    pattern = re.compile(r"\d+")
    num = re.match(pattern, str(time_string))
    
    pattern_time = re.compile(r"[a-z]+")
    time = re.findall(pattern_time, str(time_string))
    
    if num is None:
        return 0
    else:
        l_sec = ['seconds','sec']
        l_hrs = ['hrs','hours','hour']
        l_day = ['days']
        l_week= ['weeks','week']
        
        num_val = float(num.group(0))
        if time is not None:
            if set(time).intersection(l_sec):   # 초
                t_value = 0.5
            elif set(time).intersection(l_hrs):   # 시간
                t_value = num_val * 60
            elif set(time).intersection(l_day):   # 일
                t_value = num_val * 60 * 60
            elif set(time).intersection(l_week):   # 주
                t_value = num_val * 60 * 60 * 7
            else:
                t_value = num_val

#         print(num_val, time, t_value)
        
        return t_value
        
ufo["minutes"] = ufo["length_of_time"].apply(lambda row: return_minutes(row))

print(ufo[['length_of_time', 'minutes', 'seconds']].head(20))

               length_of_time  minutes    seconds
0                     2 weeks  50400.0  1209600.0
1                      30sec.      0.5       30.0
2                         NaN      0.0        0.0
3             about 5 minutes      0.0      300.0
4                           2      2.0        0.0
5                  10 minutes     10.0      600.0
6   total? maybe around 10 mi      0.0      600.0
7   several sightings from 10      0.0        0.0
8                   2 minutes      2.0      120.0
9                   2 minutes      2.0      120.0
10                  5 minutes      5.0      300.0
11                 10 minutes     10.0      600.0
12                      2 min      2.0      120.0
13                 30 seconds      0.5       30.0
14                      <3min      0.0      180.0
15                  5 minutes      5.0      300.0
16                 10 seconds      0.5       10.0
17                      5 sec      0.5        5.0
18                  5 minutes      5.0      300.0


In [34]:
# Identifying features for standardization

print(ufo[['seconds', 'minutes']].var())

# Log normalize the seconds column
ufo["seconds_log"] = np.log(ufo['seconds'])
print(ufo["seconds_log"].var())

seconds    3.156735e+10
minutes    1.911360e+06
dtype: float64
nan


  


In [35]:
# Encoding categorical variables

# encode us values as 1 and others as 0
ufo["country_enc"] = ufo["country"].apply(lambda val: 1 if val == 'us' else 0)
print(len(ufo['type'].unique()))

# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo['type'])

ufo = pd.concat([ufo, type_set], axis=1)

22


In [36]:
ufo["month"] = ufo["date"].apply(lambda row: row.month)
ufo["year"] = ufo["date"].apply(lambda row: row.year)

print(ufo[['date', 'month', 'year']].head())

                 date  month  year
0 2011-11-03 19:21:00     11  2011
1 2004-10-03 19:05:00     10  2004
2 2009-09-25 21:00:00      9  2009
3 2002-11-21 05:45:00     11  2002
4 2010-08-19 12:55:00      8  2010
