In [1]:
!pip install scikit-learn==0.24.2

Collecting scikit-learn==0.24.2
[?25l  Downloading https://files.pythonhosted.org/packages/f5/ef/bcd79e8d59250d6e8478eb1290dc6e05be42b3be8a86e3954146adbc171a/scikit_learn-0.24.2-cp36-cp36m-manylinux1_x86_64.whl (20.0MB)
[K    100% |████████████████████████████████| 20.0MB 803kB/s eta 0:00:01  6% |██                              | 1.3MB 13.2MB/s eta 0:00:02    11% |███▉                            | 2.4MB 12.4MB/s eta 0:00:02    14% |████▊                           | 2.9MB 12.0MB/s eta 0:00:02    17% |█████▋                          | 3.5MB 12.8MB/s eta 0:00:02    28% |█████████▏                      | 5.7MB 12.3MB/s eta 0:00:02    31% |██████████                      | 6.2MB 12.7MB/s eta 0:00:02    33% |██████████▉                     | 6.8MB 17.1MB/s eta 0:00:01    36% |███████████▊                    | 7.3MB 11.1MB/s eta 0:00:02    39% |████████████▋                   | 7.8MB 11.9MB/s eta 0:00:02    41% |█████████████▍                  | 8.4MB 12.0MB/s eta 0:00:01    44% |██████████

In [2]:
import sklearn
# Report the scikit-learn version that the kernel actually imported
version_template = 'The scikit-learn version is {}.'
print(version_template.format(sklearn.__version__))

The scikit-learn version is 0.24.2.


In [4]:
# import libraries
import pandas as pd
from sqlalchemy import create_engine
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
# Fetch the NLTK resources used for tokenizing, stop-word removal and lemmatizing
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from Custom_Transformers import MessageTransformer, MessageTransformer_2
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Read the messages table from the local SQLite file and discard the
# `id` and `original` columns, which are not used below.
engine = create_engine('sqlite:///Vasilis_db.db')
df = (
    pd.read_sql_table('Emergency_Messages', engine)
    .drop(columns=['id', 'original'])
)

In [6]:
# Keep only the rows whose `message` value is present
has_message = df['message'].notna()
df = df[has_message]

In [7]:
# Remove every row that has a missing value in any column
df = df.dropna(axis='index', how='any')

In [8]:
# Recode every `related` value of 2 to 1
related_is_two = df['related'].eq(2)
df.loc[related_is_two, 'related'] = 1

In [9]:
# Features are the message text and its genre; every other column is a target
feature_columns = ['message', 'genre']
X = df[feature_columns]
Y = df.drop(columns=feature_columns)

In [10]:
def tokenize(text):
    """ Function that normalizes, tokenizes and lemmatizes text

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a space, the result is split with NLTK's
    ``word_tokenize``, English stop words are removed and the remaining
    tokens are lemmatized with ``WordNetLemmatizer``.

    :param text:     input text to be processed (str)
    :return:         cleaned tokens in their original order (list of str)
    """

    # A set gives constant-time membership checks; with the plain list every
    # token had to be compared against the whole stop-word list.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # Text normalization
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # Tokenization
    words = word_tokenize(text)
    # Lemmatization of every token that is not a stop word
    cleaned_tokens = [
        lemmatizer.lemmatize(word).strip()
        for word in words
        if word not in stop_words
    ]

    return cleaned_tokens

In [11]:
# Text branch: the custom message transformer, driven by `tokenize`
text_transformer = Pipeline(steps=[
    ('messagetransformer', MessageTransformer_2(tokenize)),
])

# Genre branch: one-hot encode, ignoring categories not seen during fit
genre_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Columns are selected by position: 0 feeds the text branch, 1 the genre branch
preprocessor = ColumnTransformer(transformers=[
    ('text', text_transformer, [0]),
    ('genre', genre_transformer, [1]),
])

# Full model: preprocessing followed by a seeded random forest
pipeline_2 = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(random_state=33)),
])

In [12]:
# Seeded split; test_size=0.25 is scikit-learn's default, written out explicitly
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=33
)

In [14]:
# Fit the preprocessing step on the first 2000 training rows only
preprocessor.fit(X_train.head(2000))

ColumnTransformer(transformers=[('text',
                                 Pipeline(steps=[('messagetransformer',
                                                  MessageTransformer_2(tokenize=<function tokenize at 0x7f8a309680d0>))]),
                                 [0]),
                                ('genre',
                                 Pipeline(steps=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 [1])])

In [15]:
# Transform the same 2000 rows the preprocessor was fitted on and inspect the result
transformed_test_1 = preprocessor.transform(X_train.head(2000))
print(transformed_test_1.shape, transformed_test_1, sep='\n')

(2000, 4)
[[0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 ...
 [0. 1. 0. 0.]
 [1. 1. 0. 0.]
 [0. 0. 1. 0.]]


In [16]:
# Transform training rows 2000-7999, which the preprocessor was not fitted on
unseen_rows = X_train.iloc[2000:8000]
transformed_test_2 = preprocessor.transform(unseen_rows)
print(transformed_test_2.shape, transformed_test_2, sep='\n')

(6000, 4)
[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 ...
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]
