# Trần Triệu Tuân - ID: 1902023


# **Preprocessing for Machine Learning in Python from DataCamp**
# Chapter 3: Feature Engineering

**----------------------------------------------------------------------**

# Set up and training data

In [None]:
import pandas as pd
import numpy as np

**Loading and cleaning the hiking.json data from DataCamp**

In [None]:
# Source of the hiking trails data (hosted by DataCamp).
HIKING_URL = "https://assets.datacamp.com/production/repositories/1816/datasets/4f26c48451bdbf73db8a58e226cd3d6b45cf7bb5/hiking.json"

# Read the JSON and keep only the rows that have a Length value; chaining
# dropna() instead of mutating with inplace=True keeps the cell a single,
# re-runnable expression that always starts from the raw download.
hiking = pd.read_json(HIKING_URL).dropna(subset=["Length"])
hiking.head()

Unnamed: 0,Prop_ID,Name,Location,Park_Name,Length,Difficulty,Other_Details,Accessible,Limited_Access,lat,lon
0,B057,Salt Marsh Nature Trail,"Enter behind the Salt Marsh Nature Center, loc...",Marine Park,0.8 miles,,<p>The first half of this mile-long trail foll...,Y,N,,
1,B073,Lullwater,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,1.0 mile,Easy,Explore the Lullwater to see how nature thrive...,N,N,,
2,B073,Midwood,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.75 miles,Easy,Step back in time with a walk through Brooklyn...,N,N,,
3,B073,Peninsula,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Discover how the Peninsula has changed over th...,N,N,,
4,B073,Waterfall,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Trace the source of the Lake on the Waterfall ...,N,N,,


**Loading and cleaning the volunteer_opportunities.csv data from DataCamp**

In [None]:
# Source of the volunteer opportunities data (hosted by DataCamp).
VOLUNTEER_URL = "https://assets.datacamp.com/production/repositories/1816/datasets/668b96955d8b252aa8439c7602d516634e3f015e/volunteer_opportunities.csv"

# Read the CSV and drop the rows whose category_desc is NaN; chaining dropna()
# instead of mutating with inplace=True keeps the cell a single, re-runnable
# expression that always starts from the raw download.
volunteer = pd.read_csv(VOLUNTEER_URL).dropna(subset=["category_desc"])

# Show the first rows of the cleaned dataset
volunteer.head()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,amsl,amsl_unit,org_title,org_content_id,addresses_count,locality,region,postalcode,primary_loc,display_url,recurrence_type,hours,created_date,last_modified_date,start_date_date,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,,,Bpeace,37026,1,"5 22nd St\nNew York, NY 10010\n(40.74053152272...",NY,10010.0,,/opportunities/5008,onetime,0,January 14 2011,January 25 2011,February 01 2011,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,,,Street Project,3001,1,,NY,10026.0,,/opportunities/5016,onetime,0,January 19 2011,January 21 2011,January 29 2011,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,Strengthening Communities,,,Oxfam America,2170,1,,NY,2114.0,,/opportunities/5022,ongoing,0,January 21 2011,January 25 2011,February 14 2011,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,,,Office of Recycling Outreach and Education,36773,1,,NY,10455.0,,/opportunities/5055,onetime,0,January 28 2011,February 01 2011,February 05 2011,February 05 2011,approved,,,,,,,,
5,5056,37426,15,0,Queens Stop 'N' Swap,135,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,,,Office of Recycling Outreach and Education,36773,1,,NY,11372.0,,/opportunities/5056,onetime,0,January 28 2011,January 28 2011,February 12 2011,February 12 2011,approved,,,,,,,,


# Encoding categorical variables - binary


*   Take a look at the **hiking** dataset. Several of its columns need to be encoded before they can be used in a model; one of them is the **Accessible** column. **Accessible** is a binary feature with two values, Y or N, so it needs to be encoded into 1s and 0s. Use scikit-learn's **LabelEncoder** to do that transformation.




In [None]:
from sklearn import preprocessing
# Label encoder used to turn the "Accessible" values into integer codes
enc = preprocessing.LabelEncoder()

# Fit on the "Accessible" column and store the resulting codes next to it
accessible_codes = enc.fit_transform(hiking["Accessible"])
hiking["Accessible_enc"] = accessible_codes

# Show the original values side by side with their encoded counterparts
comparison_columns = ["Accessible", "Accessible_enc"]
print(hiking[comparison_columns].head())

  Accessible  Accessible_enc
0          Y               1
1          N               0
2          N               0
3          N               0
4          N               0


# Encoding categorical variables - one-hot


*   One of the columns in the **volunteer** dataset, **category_desc**, gives category descriptions for the volunteer opportunities listed. Because it is a categorical variable with more than two categories, we need to use one-hot encoding to transform this column numerically. Use *pandas'* **get_dummies()** function to do so.








In [None]:
# One-hot encode category_desc: one indicator column per distinct category
category_column = volunteer["category_desc"]
category_enc = pd.get_dummies(category_column)

# Preview the first rows of the indicator columns
encoded_preview = category_enc.head()
print(encoded_preview)

   Education  ...  Strengthening Communities
1          0  ...                          1
2          0  ...                          1
3          0  ...                          1
4          0  ...                          0
5          0  ...                          0

[5 rows x 6 columns]


# 3.3.1 Engineering numerical features – taking an average


* A good use case for taking an aggregate statistic to create a new feature is to take the mean of columns. Here, you have a **DataFrame** of running times named **running_times_5k**. For each name in the dataset, take the mean of their 5 run times.



In [None]:
# This notebook never creates `running_times_5k` (presumably the course
# environment preloads it), so the cell failed with a NameError on a fresh
# kernel. Build a small stand-in frame here so the cell runs on its own.
# NOTE(review): these are sample values for illustration — confirm them
# against the course dataset before relying on the numbers.
running_times_5k = pd.DataFrame(
    {
        "name": ["Sue", "Mark", "Sean", "Erin", "Jenny", "Russell"],
        "run1": [20.1, 16.5, 23.5, 21.7, 25.8, 30.9],
        "run2": [18.5, 17.1, 25.1, 21.1, 27.1, 29.6],
        "run3": [19.6, 16.9, 25.2, 20.9, 26.1, 31.4],
        "run4": [20.3, 17.6, 24.6, 22.1, 26.7, 30.4],
        "run5": [18.3, 17.3, 23.9, 22.2, 26.9, 29.9],
    }
)

# Create a list of the columns to average
run_columns = ['run1', 'run2', 'run3', 'run4', 'run5']

# Row-wise mean of the five runs. Selecting the columns once and calling
# mean(axis=1) gives the same values as apply(lambda row: ..., axis=1)
# without invoking a Python function per row.
running_times_5k["mean"] = running_times_5k[run_columns].mean(axis=1)

# Take a look at the results
print(running_times_5k)

NameError: ignored

# 3.3.2 Engineering numerical features – datetime


* There are several columns in the volunteer dataset comprised of datetimes. Let’s take a look at the **start_date_date** column and extract just the month to use as a feature for modeling.



In [None]:
# Parse the start_date_date strings into pandas datetimes
volunteer["start_date_converted"] = pd.to_datetime(volunteer["start_date_date"])

# Extract just the month from the converted column. The vectorised .dt
# accessor replaces the per-row lambda and reads more directly.
volunteer["start_date_month"] = volunteer["start_date_converted"].dt.month

# Take a look at the converted and new month columns
print(volunteer[['start_date_converted', 'start_date_month']].head())

  start_date_converted  start_date_month
1           2011-02-01                 2
2           2011-01-29                 1
3           2011-02-14                 2
4           2011-02-05                 2
5           2011-02-12                 2


# 3.4 Text classification

$P(A|B) = \frac{P(B|A)\,P(A)}{P(B)}$

# Vectorizing text


*   tf = term frequency
*   idf = inverse document frequency



**3.4.1 Engineering features from strings – extraction**


*   The Length column in the hiking dataset is a column of strings, but contained in the column is the mileage for the hike. We’re going to extract this mileage using regular expressions, and then use a lambda in Pandas to apply the extraction to the DataFrame.



In [None]:
import re

# Write a pattern to extract numbers and decimals
# Leading number of a trail length such as "0.75 miles" or "2 miles". The
# fractional part is optional so whole-number lengths are captured as well.
_MILEAGE_PATTERN = re.compile(r"\d+(?:\.\d+)?")


def return_mileage(length):
    """Extract the leading mileage figure from a trail-length string.

    Parameters
    ----------
    length : str
        Text such as ``"0.8 miles"`` or ``"2 miles"``. Non-string values
        (for example ``None`` or ``NaN``) are treated as "no mileage".

    Returns
    -------
    float or None
        The number found at the very start of ``length``, or ``None`` when
        the text does not begin with a number or is not a string.
    """
    # Missing values are not strings and cannot be searched; report "no
    # mileage" the same way a non-matching string does instead of raising.
    if not isinstance(length, str):
        return None

    # match() only looks at the beginning of the string, so a number that
    # appears later in the text is deliberately ignored.
    mile = _MILEAGE_PATTERN.match(length)
    if mile is None:
        return None

    # group(0) is the full matched number, e.g. "0.75"
    return float(mile.group(0))
        
# Parse every Length string into a numeric mileage, then show the raw and
# parsed columns side by side
length_values = hiking["Length"]
hiking["Length_num"] = length_values.apply(return_mileage)
print(hiking[["Length", "Length_num"]].head())

       Length  Length_num
0   0.8 miles        0.80
1    1.0 mile        1.00
2  0.75 miles        0.75
3   0.5 miles        0.50
4   0.5 miles        0.50


**3.4.2 Engineering features from strings – tf/idf**


*   Let’s transform the volunteer dataset’s title column into a text vector, to use in a prediction task in the next exercise.



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Set up the tf-idf vectorizer, then pick out the raw title strings
tfidf_vec = TfidfVectorizer()
title_text = volunteer["title"]

# Fit the vectorizer on the titles and encode them in the same call
text_tfidf = tfidf_vec.fit_transform(title_text)

# Convert the result to a dense array so it is shown as the cell output
text_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:

# Read the shape from the matrix itself; building a full dense copy with
# toarray() just to inspect its dimensions is unnecessary work
text_tfidf.shape

(617, 1089)

**3.4.3 Text classification using tf/idf vectors**


*   Now that we’ve encoded the volunteer dataset’s title column into tf/idf vectors, let’s use those vectors to try to predict the category_desc column.



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# Fixed seed so the train/test split, and therefore the accuracy printed
# below, is the same on every run of the notebook
SPLIT_SEED = 42

# Split the dataset according to the class distribution of category_desc
y = volunteer["category_desc"]
X_train, X_test, y_train, y_test = train_test_split(
    text_tfidf.toarray(),
    y,
    stratify=y,
    random_state=SPLIT_SEED,
)

# Gaussian naive Bayes classifier
nb = GaussianNB()

# Fit the model to the training data
nb.fit(X_train, y_train)

# Print out the model's accuracy on the held-out split
print(nb.score(X_test, y_test))

0.5548387096774193
