# ML Pipeline 
按照如下的指导要求，搭建你的机器学习管道。
### 1. 导入与加载
- 导入 Python 库
- 使用 [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html) 从数据库中加载数据集
- 定义特征变量X 和目标变量 Y

In [15]:
# import libraries
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from sklearn.metrics import classification_report

In [3]:
import nltk
#nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger','stopwords'])

In [4]:
# load data from database
engine = create_engine('sqlite:///DisasterDB.db')
df = pd.read_sql("SELECT * FROM messaes", engine)
X =  df.message
y =  df.drop(['id','message','original','genre'],axis=1)
y.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
y.describe()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
count,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,...,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0
mean,0.77365,0.170659,0.004501,0.414251,0.079493,0.050084,0.027617,0.017966,0.032804,0.0,...,0.011787,0.043904,0.278341,0.082202,0.093187,0.010757,0.093645,0.020217,0.052487,0.193584
std,0.435276,0.376218,0.06694,0.492602,0.270513,0.218122,0.163875,0.132831,0.178128,0.0,...,0.107927,0.204887,0.448191,0.274677,0.2907,0.103158,0.29134,0.140743,0.223011,0.395114
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
y.loc[y['related'] == 2] = 1

In [7]:
y[y['related'] == 2]

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report


### 2. 编写分词函数，开始处理文本

In [8]:
def tokenize(text):
    #1 Normalize text
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    #2 Tokenize text
    words = word_tokenize(text)
    #3 Remove stop words
    words = [w for w in words if w not in stopwords.words("english")]
    #4 # Lemmatize verbs by specifying pos
    lemmatizer = WordNetLemmatizer()
    stemmed = [ lemmatizer.lemmatize(w) for w in words]
    return stemmed

In [9]:
print(tokenize('We are more than 50 peoples on the street,Please help us find tent,food.'))

['50', 'people', 'street', 'please', 'help', 'u', 'find', 'tent', 'food']


In [10]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

### 3. 创建机器学习管道 
这个机器学习管道应该接收 `message` 列作输入，输出分类结果，分类结果属于该数据集中的 36 个类。你会发现 [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) 在预测多目标变量时很有用。

In [19]:
pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])

### 4. 训练管道
- 将数据分割成训练和测试集
- 训练管道

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(y_pred)

[[1 0 0 ... 0 0 0]
 [1 1 1 ... 1 1 1]
 [1 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [13]:
print(y_pred.shape)
print(y_pred[1])

(6554, 36)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1]


### 5. 测试模型
报告数据集中每个输出类别的 f1 得分、准确度和召回率。你可以对列进行遍历，并对每个元素调用 sklearn 的 `classification_report`。

In [9]:
from sklearn.metrics import classification_report

In [14]:
print(y_test.keys())

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')


In [36]:
print(y_test.values)
print(y_pred)

[[1 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 1]
 [1 1 0 ... 0 1 1]
 ...
 [1 1 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]
[[1 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 1]
 [1 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [17]:
from sklearn.metrics import classification_report
category_names = y.columns.values
print(classification_report(y_test, y_pred, target_names=category_names))
#results = pd.DataFrame(columns=['Category', 'f_score', 'precision', 'recall'])

                        precision    recall  f1-score   support

               related       0.85      0.94      0.90      5061
               request       0.79      0.52      0.63      1203
                 offer       0.35      0.24      0.28        79
           aid_related       0.76      0.69      0.72      2817
          medical_help       0.57      0.17      0.26       591
      medical_products       0.70      0.20      0.31       394
     search_and_rescue       0.52      0.20      0.29       210
              security       0.39      0.10      0.16       163
              military       0.47      0.16      0.24       266
           child_alone       0.38      0.38      0.38        47
                 water       0.79      0.51      0.62       510
                  food       0.82      0.65      0.73       775
               shelter       0.77      0.45      0.57       610
              clothing       0.62      0.31      0.41       153
                 money       0.48      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 6. 优化模型
使用网格搜索来找到最优的参数组合。 

In [20]:
parameters =  {
    'clf__estimator__n_estimators': [50, 100],
    'clf__estimator__min_samples_split': [2, 4],
    'clf__estimator__criterion': ['entropy']
}


# create grid search object
cv =  GridSearchCV(pipeline, param_grid=parameters,cv=2, n_jobs=-1, verbose=3)

### 7. 测试模型
打印微调后的模型的精确度、准确率和召回率。  

因为本项目主要关注代码质量、开发流程和管道技术，所有没有模型性能指标的最低要求。但是，微调模型提高精确度、准确率和召回率可以让你的项目脱颖而出——特别是让你的简历更出彩。

In [None]:
cv.fit(X_train, y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] clf__estimator__criterion=entropy, clf__estimator__min_samples_split=2, clf__estimator__n_estimators=50 


[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__estimator__criterion=entropy, clf__estimator__min_samples_split=2, clf__estimator__n_estimators=50, score=0.253, total= 3.1min
[CV] clf__estimator__criterion=entropy, clf__estimator__min_samples_split=2, clf__estimator__n_estimators=50 


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.1min remaining:    0.0s


[CV]  clf__estimator__criterion=entropy, clf__estimator__min_samples_split=2, clf__estimator__n_estimators=50, score=0.241, total= 3.2min
[CV] clf__estimator__criterion=entropy, clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100 


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  6.2min remaining:    0.0s


[CV]  clf__estimator__criterion=entropy, clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100, score=0.250, total= 5.2min
[CV] clf__estimator__criterion=entropy, clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100 


In [1]:
y_pred = cv.predict(X_test)

NameError: name 'cv' is not defined

### 8. 继续优化模型，比如：
* 尝试其他的机器学习算法
* 尝试除 TF-IDF 外其他的特征

In [None]:
category_names = y.columns.values
print(classification_report(y_test, y_pred, target_names=category_names))

### 9. 导出模型为 pickle file

In [None]:
output = open('data.pkl', 'wb')
s = pickle.dump(cv, output)
output.close()

### 10. Use this notebook to complete `train.py`
使用资源 (Resources)文件里附带的模板文件编写脚本，运行上述步骤，创建一个数据库，并基于用户指定的新数据集输出一个模型。