# ML Pipeline 
按照如下的指导要求，搭建你的机器学习管道。
### 1. 导入与加载
- 导入 Python 库
- 使用 [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html) 从数据库中加载数据集
- 定义特征变量X 和目标变量 Y

In [4]:
# import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import sqlite3
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [5]:
# load data from database
engine = create_engine('sqlite:///InsertDatabaseName.db')
df = pd.read_sql_table(table_name='InsertTableName',con=engine,index_col='id')
X = df['message']
Y = df[['related', 'request', 'offer',
       'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
       'security', 'military', 'child_alone', 'water', 'food', 'shelter',
       'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report']]

### 2. 编写分词函数，开始处理文本

In [6]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

def tokenize(text):
    text = re.sub(r"[^z-zA-Z0-9]"," " ,text.lower())    
    tokens = word_tokenize(text)    
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]    
    return tokens


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tendays\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tendays\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tendays\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### 3. 创建机器学习管道 
这个机器学习管道应该接收 `message` 列作输入，输出分类结果，分类结果属于该数据集中的 36 个类。你会发现 [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) 在预测多目标变量时很有用。

In [7]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeClassifier

class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)


pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier( DecisionTreeClassifier(random_state =42), n_jobs = -1))
         ])

### 4. 训练管道
- 将数据分割成训练和测试集
- 训练管道

In [8]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split 


X_train,X_test ,Y_train ,Y_test =train_test_split(X,Y)

pipeline.fit(X_train ,Y_train)
Y_pred=pipeline.predict(X_test)

Y_test.columns

Y_pred=pd.DataFrame(Y_pred,columns=['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report']).astype('int')
Y_pred['related'].head()
Y_test=pd.DataFrame(Y_test).astype('int')

for i in Y_test.columns:
    print(Y_test[i].head())


id
10084    1
2269     1
22619    1
19207    1
18112    1
Name: related, dtype: int32
id
10084    0
2269     1
22619    0
19207    0
18112    0
Name: request, dtype: int32
id
10084    0
2269     0
22619    0
19207    0
18112    0
Name: offer, dtype: int32
id
10084    0
2269     1
22619    0
19207    0
18112    1
Name: aid_related, dtype: int32
id
10084    0
2269     0
22619    0
19207    0
18112    0
Name: medical_help, dtype: int32
id
10084    0
2269     0
22619    0
19207    0
18112    0
Name: medical_products, dtype: int32
id
10084    0
2269     0
22619    0
19207    0
18112    0
Name: search_and_rescue, dtype: int32
id
10084    0
2269     0
22619    0
19207    0
18112    0
Name: security, dtype: int32
id
10084    0
2269     0
22619    0
19207    0
18112    0
Name: military, dtype: int32
id
10084    0
2269     0
22619    0
19207    0
18112    0
Name: child_alone, dtype: int32
id
10084    0
2269     1
22619    0
19207    0
18112    0
Name: water, dtype: int32
id
10084    0
2269     1

### 5. 测试模型
报告数据集中每个输出类别的 f1 得分、准确度和召回率。你可以对列进行遍历，并对每个元素调用 sklearn 的 `classification_report`。

In [9]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

for i in Y_test.columns:
    print(i)
    print('f1_scroe:',f1_score(Y_test[i], Y_pred[i], average='macro'))
    print('precision_score:',precision_score(Y_test[i], Y_pred[i], average="macro"))
    print('recall_score:',recall_score(Y_test[i], Y_pred[i], average="macro"),'\n')

related
f1_scroe: 0.30460787313301513
precision_score: 0.3486612350139884
recall_score: 0.3356308736275691 

request
f1_scroe: 0.4761998421531879
precision_score: 0.5704669590582585
recall_score: 0.5072673411020908 

offer
f1_scroe: 0.4987763842153564
precision_score: 0.49786259541984734
recall_score: 0.4996935335580754 

aid_related
f1_scroe: 0.4928291172131446
precision_score: 0.5943255727340443
recall_score: 0.5398720774408576 

medical_help
f1_scroe: 0.4870827364527284
precision_score: 0.5037712446783548
recall_score: 0.5005368744458263 

medical_products
f1_scroe: 0.5072770885067814
precision_score: 0.5505591002205086
recall_score: 0.5090944167993117 

search_and_rescue
f1_scroe: 0.5004703560820016
precision_score: 0.5176942135273201
recall_score: 0.5029291758356911 

security
f1_scroe: 0.5035763609233651
precision_score: 0.5415189486552567
recall_score: 0.5037657448884553 

military
f1_scroe: 0.5019926373327108
precision_score: 0.5403056655354449
recall_score: 0.5047032698072216 

### 6. 优化模型
使用网格搜索来找到最优的参数组合。 

In [13]:
from sklearn.model_selection import GridSearchCV

parameters = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000),
        'tfidf__use_idf': (True, False)}

cv = GridSearchCV(pipeline, parameters)

SyntaxError: invalid syntax (<ipython-input-13-674e9fdd6354>, line 8)

### 7. 测试模型
打印微调后的模型的精确度、准确率和召回率。  

因为本项目主要关注代码质量、开发流程和管道技术，所有没有模型性能指标的最低要求。但是，微调模型提高精确度、准确率和召回率可以让你的项目脱颖而出——特别是让你的简历更出彩。

In [None]:
fit=cv.fit(X_train, Y_train)
print("\nBest Parameters:", fit.best_params_)
Y_pred=fit.best_estimator_.predict(X_test)

Y_pred=pd.DataFrame(Y_pred,columns=['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report']).astype('int')
Y_test=pd.DataFrame(Y_test).astype('int')
for i in Y_test.columns:
    print(i)
    print('f1_scroe:',f1_score(Y_test[i], Y_pred[i], average='macro'))
    print('precision_score:',precision_score(Y_test[i], Y_pred[i], average="macro"))
    print('recall_score:',recall_score(Y_test[i], Y_pred[i], average="macro"),'\n')

### 8. 继续优化模型，比如：
* 尝试其他的机器学习算法
* 尝试除 TF-IDF 外其他的特征

In [None]:
from sklearn.ensemble import RandomForestClassifier
import nltk
nltk.download('averaged_perceptron_tagger')

def tokenize(text):
    text = re.sub(r"[^z-zA-Z0-9]"," " ,text.lower())
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return tokens



def TextLengthExtractor(text):
    txt_length = text.apply(len)
    return txt_length
    
pipeline_fix = Pipeline([
        ('features', FeatureUnion([
            
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ]))


        ])),
        ('clf', RandomForestClassifier())
    ])

pipeline_fix.fit(X_train,Y_train)
Y_pred=pipeline_fix.predict(X_test)
Y_pred=pd.DataFrame(Y_pred,columns=['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report']).astype('float')

Y_test=Y_test.astype('float')

for i in Y_test.columns:
    print(i)
    print('f1_scroe:',f1_score(Y_test[i], Y_pred[i], average='macro'))
    print('precision_score:',precision_score(Y_test[i], Y_pred[i], average="macro"))
    print('recall_score:',recall_score(Y_test[i], Y_pred[i], average="macro"),'\n')

### 9. 导出模型为 pickle file

In [None]:
import pickle 
from sklearn.externals import joblib
from sklearn.svm import SVC
from sklearn import datasets



#1.保存成Python支持的文件格式Pickle

#在当前目录下可以看到svm.pickle
with open('pipeline_fix.pickle','wb') as fw:
    pickle.dump(pipeline_fix,fw)
#加载svm.pickle
with open('pipeline_fix.pickle','rb') as fr:
    new_pipeline_fix1 = pickle.load(fr)

### 10. Use this notebook to complete `train.py`
使用资源 (Resources)文件里附带的模板文件编写脚本，运行上述步骤，创建一个数据库，并基于用户指定的新数据集输出一个模型。