# Data Cleansing and Feature Engineering

## 1. Setting Up Spark Context

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

spark = SparkSession \
    .builder \
    .getOrCreate()

## 2. Download data from Object Store

In [3]:
import os
import getpass

def get_or_set_environment_variable(variable):
    try:
        var = os.environ[variable]
    except KeyError:
        var = getpass.getpass('Please enter value for {:}: '.format(variable))
    
    os.environ[variable] = var
    return var

ibm_api_key_id = get_or_set_environment_variable('IBM_API_KEY_ID')
ibm_cloud_store_bucket = get_or_set_environment_variable('IBM_OBJECT_STORE_BUCKET')

Please enter value for IBM_API_KEY_ID: ········
Please enter value for IBM_OBJECT_STORE_BUCKET: ········


In [4]:
import json
import os

import types
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

client = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_api_key_id,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client.get_object(Bucket=ibm_cloud_store_bucket,
                         Key='etl_parquet_files.json')['Body']
# add missing __iter__ method

if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

files = json.load(body)
files

{'train': ['disaster_detection_train-0000.parquet'],
 'label': ['disaster_detection_label-0000.parquet'],
 'test': ['disaster_detection_test-0000.parquet']}

In [5]:
def load_dataframe(files, **kargs):
    dfs = []
    for fn in files:
        body = client.get_object(Bucket=ibm_cloud_store_bucket,
                                 Key=fn)['Body']
        if not hasattr(body, "__iter__"):
            body.__iter__ = types.MethodType( __iter__, body )
        
        tfn = 'temp_{:}'.format(fn)
        with open(tfn, 'wb') as temp:
            temp.write(body.read())
        dfs.append(spark.read.options(**kargs).parquet(tfn))
    df = dfs.pop()
    for other in dfs:
        df = df.union(other)
    return df

df_train = load_dataframe(files['train'])
df_test = load_dataframe(files['test'])
df_label = load_dataframe(files['label'])

In [6]:
df_train.schema == df_test.schema

True

## 2. Data Cleansing

In [7]:
def drop_unused_cols(df):
    return df.drop('location', 'keyword')

df_train = drop_unused_cols(df_train)
df_test = drop_unused_cols(df_test)

df_train.limit(10).toPandas()

Unnamed: 0,id,text
0,1,Our Deeds are the Reason of this #earthquake M...
1,4,Forest fire near La Ronge Sask. Canada
2,5,All residents asked to 'shelter in place' are ...
3,6,"13,000 people receive #wildfires evacuation or..."
4,7,Just got sent this photo from Ruby #Alaska as ...
5,8,#RockyFire Update => California Hwy. 20 closed...
6,10,#flood #disaster Heavy rain causes flash flood...
7,13,I'm on top of the hill and I can see a fire in...
8,14,There's an emergency evacuation happening now ...
9,15,I'm afraid that the tornado is coming to our a...


In [8]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.2.0-py2.py3-none-any.whl (241 kB)
[K     |████████████████████████████████| 241 kB 13.0 MB/s eta 0:00:01
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.2.0


In [9]:
import re
import string

from unidecode import unidecode
import pyspark.sql.functions as sfun



def clean_text(text):    
    @sfun.udf
    def normalize_alphabet(text):
        text = unidecode(text)
        text = text.encode('ascii', errors='ignore').decode('utf-8', errors='ignore')
        text = text.lower()
        return text
    
    def no_squares(text):
        return sfun.regexp_replace(text, '\[.*?\]', '')
    
    def no_links(text):
        return sfun.regexp_replace(text, 'https?://\S+|www\.\S+', '')
    
    def no_angles(text):
        return sfun.regexp_replace(text, '<.*?>+', '')
    
    def no_punctuation(text):
        return sfun.regexp_replace(text, '[%s]' % re.escape(string.punctuation), '')
    
    def no_newline(text):
        return sfun.regexp_replace(text, '\n', '')
    
    def no_words_with_nums(text):
        return sfun.regexp_replace(text, '\w*\d\w*\s?', '')
    
    
    text = normalize_alphabet(text)
    text = no_squares(text)
    text = no_links(text)
    text = no_angles(text)
    text = no_punctuation(text)
    text = no_newline(text)
    text = no_words_with_nums(text)
    return text

df_train = df_train.withColumn('clean_text', clean_text('text'))
df_test = df_test.withColumn('clean_text', clean_text('text'))

df_train.limit(10).toPandas()

Unnamed: 0,id,text,clean_text
0,1,Our Deeds are the Reason of this #earthquake M...,our deeds are the reason of this earthquake ma...
1,4,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada
2,5,All residents asked to 'shelter in place' are ...,all residents asked to shelter in place are be...
3,6,"13,000 people receive #wildfires evacuation or...",people receive wildfires evacuation orders in ...
4,7,Just got sent this photo from Ruby #Alaska as ...,just got sent this photo from ruby alaska as s...
5,8,#RockyFire Update => California Hwy. 20 closed...,rockyfire update california hwy closed in bot...
6,10,#flood #disaster Heavy rain causes flash flood...,flood disaster heavy rain causes flash floodin...
7,13,I'm on top of the hill and I can see a fire in...,im on top of the hill and i can see a fire in ...
8,14,There's an emergency evacuation happening now ...,theres an emergency evacuation happening now i...
9,15,I'm afraid that the tornado is coming to our a...,im afraid that the tornado is coming to our area


In [10]:
df_train = df_train.select('id', 'text', sfun.split('clean_text', '\s+').alias('clean_text'))
df_test = df_test.select('id', 'text', sfun.split('clean_text', '\s+').alias('clean_text'))

df_train.limit(10).toPandas()

Unnamed: 0,id,text,clean_text
0,1,Our Deeds are the Reason of this #earthquake M...,"[our, deeds, are, the, reason, of, this, earth..."
1,4,Forest fire near La Ronge Sask. Canada,"[forest, fire, near, la, ronge, sask, canada]"
2,5,All residents asked to 'shelter in place' are ...,"[all, residents, asked, to, shelter, in, place..."
3,6,"13,000 people receive #wildfires evacuation or...","[people, receive, wildfires, evacuation, order..."
4,7,Just got sent this photo from Ruby #Alaska as ...,"[just, got, sent, this, photo, from, ruby, ala..."
5,8,#RockyFire Update => California Hwy. 20 closed...,"[rockyfire, update, california, hwy, closed, i..."
6,10,#flood #disaster Heavy rain causes flash flood...,"[flood, disaster, heavy, rain, causes, flash, ..."
7,13,I'm on top of the hill and I can see a fire in...,"[im, on, top, of, the, hill, and, i, can, see,..."
8,14,There's an emergency evacuation happening now ...,"[theres, an, emergency, evacuation, happening,..."
9,15,I'm afraid that the tornado is coming to our a...,"[im, afraid, that, the, tornado, is, coming, t..."


In [11]:
import re

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop = stopwords.words('english') + ["I'm", 'via', 'u'] + [word + "'s" for word in stopwords.words('english')]

stop = list(map(lambda word: re.sub('[{:}]'.format(re.escape(string.punctuation)), '', word).lower(),
                stop))

stop[:10]

[nltk_data] Downloading package stopwords to /home/wsuser/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre']

In [12]:
@sfun.udf
def remove_stopwords(text):
    return list(filter(lambda word: not word in stop, text))

df_train = df_train.select('id', 'text', remove_stopwords('clean_text').alias('clean_text'))
df_test = df_test.select('id', 'text', remove_stopwords('clean_text').alias('clean_text'))

df_train.limit(10).toPandas()

Unnamed: 0,id,text,clean_text
0,1,Our Deeds are the Reason of this #earthquake M...,"[deeds, reason, earthquake, may, allah, forgiv..."
1,4,Forest fire near La Ronge Sask. Canada,"[forest, fire, near, la, ronge, sask, canada]"
2,5,All residents asked to 'shelter in place' are ...,"[residents, asked, shelter, place, notified, o..."
3,6,"13,000 people receive #wildfires evacuation or...","[people, receive, wildfires, evacuation, order..."
4,7,Just got sent this photo from Ruby #Alaska as ...,"[got, sent, photo, ruby, alaska, smoke, wildfi..."
5,8,#RockyFire Update => California Hwy. 20 closed...,"[rockyfire, update, california, hwy, closed, d..."
6,10,#flood #disaster Heavy rain causes flash flood...,"[flood, disaster, heavy, rain, causes, flash, ..."
7,13,I'm on top of the hill and I can see a fire in...,"[top, hill, see, fire, woods]"
8,14,There's an emergency evacuation happening now ...,"[emergency, evacuation, happening, building, a..."
9,15,I'm afraid that the tornado is coming to our a...,"[afraid, tornado, coming, area]"


In [13]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

lemma = WordNetLemmatizer()

@sfun.udf
def lemmatize_words(words):
    def exhaustive_lemmatization(word):
        for pos in ['n', 'v', 'a']:
            lem = lemma.lemmatize(word, pos=pos)
            if lem != word:
                return lem
        return word
    return list(map(exhaustive_lemmatization, words))

df_train = df_train.select('id', 'text', lemmatize_words('clean_text').alias('clean_text'))
df_test = df_test.select('id', 'text', lemmatize_words('clean_text').alias('clean_text'))

df_train.limit(10).toPandas()

[nltk_data] Downloading package wordnet to /home/wsuser/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Unnamed: 0,id,text,clean_text
0,1,Our Deeds are the Reason of this #earthquake M...,"[deed, reason, earthquake, may, allah, forgive..."
1,4,Forest fire near La Ronge Sask. Canada,"[forest, fire, near, la, ronge, sask, canada]"
2,5,All residents asked to 'shelter in place' are ...,"[resident, ask, shelter, place, notify, office..."
3,6,"13,000 people receive #wildfires evacuation or...","[people, receive, wildfire, evacuation, order,..."
4,7,Just got sent this photo from Ruby #Alaska as ...,"[get, send, photo, ruby, alaska, smoke, wildfi..."
5,8,#RockyFire Update => California Hwy. 20 closed...,"[rockyfire, update, california, hwy, close, di..."
6,10,#flood #disaster Heavy rain causes flash flood...,"[flood, disaster, heavy, rain, cause, flash, f..."
7,13,I'm on top of the hill and I can see a fire in...,"[top, hill, see, fire, wood]"
8,14,There's an emergency evacuation happening now ...,"[emergency, evacuation, happen, build, across,..."
9,15,I'm afraid that the tornado is coming to our a...,"[afraid, tornado, come, area]"


In [14]:
@sfun.udf
def join_words(words):
    return ' '.join(words)

df_train = df_train.select('id', 'text', join_words('clean_text').alias('clean_text'))
df_test = df_test.select('id', 'text', join_words('clean_text').alias('clean_text'))

df_train.limit(10).toPandas()

Unnamed: 0,id,text,clean_text
0,1,Our Deeds are the Reason of this #earthquake M...,deed reason earthquake may allah forgive u
1,4,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada
2,5,All residents asked to 'shelter in place' are ...,resident ask shelter place notify officer evac...
3,6,"13,000 people receive #wildfires evacuation or...",people receive wildfire evacuation order calif...
4,7,Just got sent this photo from Ruby #Alaska as ...,get send photo ruby alaska smoke wildfire pour...
5,8,#RockyFire Update => California Hwy. 20 closed...,rockyfire update california hwy close directio...
6,10,#flood #disaster Heavy rain causes flash flood...,flood disaster heavy rain cause flash flood st...
7,13,I'm on top of the hill and I can see a fire in...,top hill see fire wood
8,14,There's an emergency evacuation happening now ...,emergency evacuation happen build across street
9,15,I'm afraid that the tornado is coming to our a...,afraid tornado come area


In [15]:
df = df_train.join(df_label, on='id', how='inner')

words = df.rdd.flatMap(
    lambda row: [(word, (int(row['target']), 1 - int(row['target']))) for word in row['clean_text'].split()])
wordcount = words.reduceByKey(lambda agg, cat: (agg[0] + cat[0], agg[1] + cat[1]))
print(wordcount.count())
top20_words = wordcount.sortBy(keyfunc=lambda row: -sum(row[1])).take(20)
top20_words

14101


[('get', (118, 310)),
 ('like', (99, 292)),
 ('fire', (266, 89)),
 ('amp', (106, 192)),
 ('go', (81, 194)),
 ('new', (57, 172)),
 ('bomb', (171, 50)),
 ('one', (69, 134)),
 ('people', (106, 93)),
 ('say', (93, 105)),
 ('news', (132, 53)),
 ('burn', (86, 94)),
 ('kill', (156, 17)),
 ('make', (42, 129)),
 ('video', (71, 97)),
 ('flood', (120, 45)),
 ('u', (81, 81)),
 ('time', (58, 104)),
 ('crash', (110, 51)),
 ('come', (52, 108))]

In [16]:
import plotly.io as pio
pio.renderers.default = 'notebook_connected'

In [17]:
import pandas as pd
import plotly.express as px

c_words, counts = zip(*top20_words)
c_words *= 2

count1, count0 = zip(*counts)
c_counts = count0 + count1

c_target = ['0'] * len(counts) + ['1'] * len(counts)

fig = px.bar(pd.DataFrame({'word': c_words,
                     'count': c_counts,
                     'target': c_target
                    }),
             x='word',
             y='count',
             text='count',
             title='Top 20 Words per Target',
             color='target')

fig.show()

## 3. Feature engineering

We create 3 features from the data and will evaluate performance.

* `CountVectorizer`: Converts a text document to a sparse vector of token counts.
* `TF-IDF`: Short for ‘term frequency–inverse document frequency’, is a numerical statistic that is intended to reflect how important a word is. In short we count the occurences of the word per document and normalize the count by taking the overall frequency of the term into account.
* `Word2Vec`: A neural network model is trained to learn word associations from a large corpus of text.

In [18]:
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, HashingTF, IDF, Word2Vec
from pyspark.ml import Pipeline

regex_tokenizer = RegexTokenizer(inputCol='clean_text', outputCol='words', pattern=r'\W')
counter = CountVectorizer(inputCol='words', outputCol='features_count', vocabSize=2500, minDF=5)
hashing_tf = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=2500)
idf = IDF(inputCol='raw_features', outputCol='features_tfidf', minDocFreq=5)
word2vec = Word2Vec(inputCol='words', outputCol='features_w2v', maxSentenceLength=50)

pipeline = Pipeline(stages=[regex_tokenizer,
                            counter,
                            hashing_tf,
                            idf,
                            word2vec]).fit(df_train)

df_eng_train = pipeline.transform(df_train).select('id', 'text', 'features_count', 'features_tfidf', 'features_w2v')
df_eng_test = pipeline.transform(df_test).select('id', 'text', 'features_count', 'features_tfidf', 'features_w2v')

df_eng_train.show()

+---+--------------------+--------------------+--------------------+--------------------+
| id|                text|      features_count|      features_tfidf|        features_w2v|
+---+--------------------+--------------------+--------------------+--------------------+
|  1|Our Deeds are the...|(2266,[16,80,201,...|(2500,[26,166,336...|[-0.0027080011953...|
|  4|Forest fire near ...|(2266,[2,139,183,...|(2500,[191,974,20...|[-0.0017992729055...|
|  5|All residents ask...|(2266,[212,318,32...|(2500,[294,691,11...|[0.00301898798947...|
|  6|13,000 people rec...|(2266,[8,41,85,21...|(2500,[8,325,644,...|[-0.0085309529677...|
|  7|Just got sent thi...|(2266,[0,85,130,1...|(2500,[178,198,32...|[-0.0092202152849...|
|  8|#RockyFire Update...|(2266,[2,41,85,21...|(2500,[191,325,64...|[-0.0043081726810...|
| 10|#flood #disaster ...|(2266,[15,21,57,1...|(2500,[353,356,64...|[0.00218058816002...|
| 13|I'm on top of the...|(2266,[2,25,187,1...|(2500,[15,191,187...|[-0.0103795815724...|
| 14|There

## 4. Serializing the dataframes in *Parquet* format

In [19]:
!rm -r ./disaster_detection_*

In [20]:
import glob

temp_parquet_file = os.path.join(os.path.curdir,
                                 'disaster_detection_clean_{}')
df_eng_train.write.parquet(temp_parquet_file.format('train'), mode='overwrite')
df_eng_test.write.parquet(temp_parquet_file.format('test'), mode='overwrite')

glob.glob(temp_parquet_file.format('*'))

['./disaster_detection_clean_test', './disaster_detection_clean_train']

## 5. Uploading the files to object cloud

In [21]:
def upload_parquet(client, path):
    parts = glob.glob(os.path.join(path, '*.parquet'))
    parquets = ['{:s}-{:04d}.parquet'.format(os.path.split(path)[-1], i)
                for i in range(len(parts))]
    for part, parquet in zip(parts, parquets):
        with open(part, 'rb') as parquetF:
            client.put_object(Bucket=ibm_cloud_store_bucket,
                          Body=parquetF,
                          Key=parquet
                         )
    return parquets

client = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_api_key_id,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')


parquets = {}
for dataset in ('train', 'test'):
    parquets[dataset] = upload_parquet(client, temp_parquet_file.format(dataset))

print(parquets)

{'train': ['disaster_detection_clean_train-0000.parquet'], 'test': ['disaster_detection_clean_test-0000.parquet']}


In [22]:
import json


parquets['label'] = files['label']
client.put_object(Bucket=ibm_cloud_store_bucket,
                  Body=json.dumps(parquets),
                  Key='feature_eng_parquet_files.json')

{'ResponseMetadata': {'RequestId': 'e7b44b6f-46d0-49b1-9ae1-0c04548a2065',
  'HostId': '',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Sat, 06 Feb 2021 21:46:11 GMT',
   'x-clv-request-id': 'e7b44b6f-46d0-49b1-9ae1-0c04548a2065',
   'server': 'Cleversafe',
   'x-clv-s3-version': '2.5',
   'x-amz-request-id': 'e7b44b6f-46d0-49b1-9ae1-0c04548a2065',
   'etag': '"fa8e4e3acd022a663841a2bb8fc8265c"',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"fa8e4e3acd022a663841a2bb8fc8265c"'}