# Data Exploration

## 1. Setting Up Spark Context

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
# Reuse a running SparkContext if there is one; otherwise start one with the
# "local[*]" master configured below.
spark_conf = SparkConf().setMaster("local[*]")
sc = SparkContext.getOrCreate(spark_conf)

# SparkSession entry point used by every DataFrame cell below.
spark = SparkSession.builder.getOrCreate()

## 2. Download and Load Data from Kaggle

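You need a Kaggle API token for the download to work. Create one on your Kaggle account settings page and upload the resulting `kaggle.json` to your IBM Cloud Object Storage bucket; the cells below read it from there.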

In [3]:
import os
import getpass

def get_or_set_environment_variable(variable):
    """Return the value of an environment variable, prompting for it if unset.

    Args:
        variable: Name of the environment variable to look up.

    Returns:
        The value found in ``os.environ`` or, when the variable is missing,
        the value typed at a hidden ``getpass`` prompt.

    The value is written back to ``os.environ`` in both cases, so later
    lookups in the same process do not prompt again.
    """
    value = os.environ.get(variable)
    if value is None:
        # Hidden prompt so the secret is not echoed into the notebook output.
        value = getpass.getpass('Please enter value for {:}: '.format(variable))

    os.environ[variable] = value
    return value

# Resolve the IBM Cloud Object Storage API key and bucket name from the
# environment, falling back to a hidden prompt when they are not set.
ibm_api_key_id = get_or_set_environment_variable('IBM_API_KEY_ID')
ibm_cloud_store_bucket = get_or_set_environment_variable('IBM_OBJECT_STORE_BUCKET')

Please enter value for IBM_API_KEY_ID: ········
Please enter value for IBM_OBJECT_STORE_BUCKET: ········


In [4]:
import json
import os

import types
from botocore.client import Config
import ibm_boto3

def __iter__(self):
    """Placeholder bound onto the response body when it lacks ``__iter__``.

    It only makes ``hasattr(body, "__iter__")`` true; it returns 0 and does
    not produce a real iterator.
    """
    return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage.
# The API key and bucket name come from `ibm_api_key_id` and
# `ibm_cloud_store_bucket` (resolved in the previous cell), so no credentials
# are embedded in this cell.
client = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_api_key_id,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

# Fetch the Kaggle API token file that was uploaded to the bucket.
body = client.get_object(Bucket=ibm_cloud_store_bucket, Key='kaggle.json')['Body']
try:
    # Boilerplate shim that gives the body an `__iter__` attribute when it has
    # none. `json.load` only calls `body.read()`, so the shim is kept purely
    # for parity with the original cell.
    if not hasattr(body, "__iter__"):
        body.__iter__ = types.MethodType(__iter__, body)

    creds = json.load(body)
finally:
    # Release the underlying HTTP stream even if the JSON is malformed.
    body.close()

# Export the token through the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables for the `!kaggle` commands in the following cells.
os.environ['KAGGLE_USERNAME'] = creds['username']
os.environ['KAGGLE_KEY'] = creds['key']

In [5]:
# List competitions through the Kaggle CLI — presumably a smoke test that the
# KAGGLE_USERNAME / KAGGLE_KEY values exported above are accepted.
!kaggle competitions list

ref                                            deadline             category            reward  teamCount  userHasEntered  
---------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
contradictory-my-dear-watson                   2030-07-01 23:59:00  Getting Started     Prizes         87           False  
gan-getting-started                            2030-07-01 23:59:00  Getting Started     Prizes        149           False  
tpu-getting-started                            2030-06-03 23:59:00  Getting Started  Knowledge        356           False  
digit-recognizer                               2030-01-01 00:00:00  Getting Started  Knowledge       2189           False  
titanic                                        2030-01-01 00:00:00  Getting Started  Knowledge      16994           False  
house-prices-advanced-regression-techniques    2030-01-01 00:00:00  Getting Started  Knowledge       4688           False  


In [6]:
# Download the `nlp-getting-started` competition archive into ./work.
!kaggle competitions download -c nlp-getting-started -p "./work"

Downloading nlp-getting-started.zip to ./work
  0%|                                                | 0.00/593k [00:00<?, ?B/s]
100%|████████████████████████████████████████| 593k/593k [00:00<00:00, 17.0MB/s]


In [7]:
import glob

# Directory the Kaggle CLI downloaded into.
work_path = os.path.join(os.path.curdir, 'work')

# glob returns matches in arbitrary order, so sort to make the choice
# deterministic if more than one archive is present.
zip_matches = sorted(glob.glob(os.path.join(work_path, '*.zip')))
if not zip_matches:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        'No .zip archive found in {!r}; run the Kaggle download cell first.'.format(work_path))

zip_path = zip_matches[0]
zip_path

'./work/nlp-getting-started.zip'

In [8]:
import zipfile

# Unpack every member of the downloaded archive next to it in the work folder.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=work_path)

In [9]:
# Show which CSV files the archive contained.
csv_pattern = os.path.join(work_path, '*.csv')
glob.glob(csv_pattern)

['./work/sample_submission.csv', './work/test.csv', './work/train.csv']

In [10]:
# Read the training set with a header row, schema inference, multi-line
# record support and UTF-8 decoding.
train_csv_path = os.path.join(work_path, 'train.csv')
csv_reader = (
    spark.read
    .option('header', True)
    .option('inferschema', True)
    .option('multiLine', True)
    .option('encoding', 'UTF-8')
)
df = csv_reader.csv(train_csv_path)

# Preview only the first rows instead of collecting the whole frame.
df.limit(20).toPandas()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


## 3. Exploring the data

In [11]:
# Print the column names, types and nullability Spark inferred for the training data.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- keyword: string (nullable = true)
 |-- location: string (nullable = true)
 |-- text: string (nullable = true)
 |-- target: integer (nullable = true)



In [12]:
# Per-column summary statistics (count, mean, stddev, min, max).
summary_statistics = df.describe()
# The summary is tiny, so converting it to pandas for display is cheap.
summary_statistics.limit(20).toPandas()

Unnamed: 0,summary,id,keyword,location,text,target
0,count,7613.0,7552,5080,7613,7613.0
1,mean,5441.934848285827,,7727.791666666667,,0.4296597924602653
2,stddev,3137.1160896461192,,23681.099269393806,,0.4950600449341294
3,min,1.0,ablaze,,! Residents Return To Destroyed Homes As Washi...,0.0
4,max,10873.0,wrecked,åø\_(?)_/åø,åÈMGN-AFRICAå¨ pin:263789F4 åÈ Correction: Ten...,1.0


In [13]:
# Total number of rows; reused below as the denominator of the percentage columns.
row_count = df.count()
row_count

7613

In [14]:
import pyspark.sql.functions as sfun
# Count the non-null values of every column in a single pass.
# The column list is taken from df.columns (not hard-coded) so that `counts`
# and `per_filled` always line up with the df.columns labels used when plotting.
counts = df.select(*[sfun.count(column_name) for column_name in df.columns]).first()

# Fraction of rows holding a value, per column, in df.columns order.
per_filled = [cnt / float(row_count) for cnt in counts]
per_filled

[1.0, 0.9919873899908052, 0.6672796532247471, 1.0, 1.0]

In [15]:
import plotly.io as pio
# Make 'notebook_connected' the default Plotly renderer for this notebook.
pio.renderers.default = 'notebook_connected'

In [16]:
import plotly.express as px
import pandas as pd

# One bar per column: height is the filled fraction, label is the raw non-null count.
filled_per_column = pd.DataFrame({'column': df.columns, 'filled': per_filled})

fig = px.bar(
    filled_per_column,
    x='column',
    y='filled',
    text=counts,
    labels={'text': 'count'},
    title='Percentage of filled rows per column',
)
# Render the y axis as percentages with two decimals.
fig.update_yaxes(tickformat=".02%")
fig.show()

In [17]:
# The ten most frequent non-null keywords with their overall row count.
top10_keywords = (
    df.groupby('keyword')
    .count()
    .sort(sfun.desc('count'))
    .na.drop()
    .limit(10)
)

# Per-(target, keyword) counts, restricted to the top ten keywords by the
# right join; `total` carries each keyword's overall count for ordering.
keyword_totals = top10_keywords.select('keyword', sfun.col('count').alias('total'))
top10_with_target = (
    df.groupby('target', 'keyword')
    .count()
    .join(keyword_totals, on='keyword', how='right')
)

# Sort while `total` is still part of the frame, then project the plot columns.
# (Sorting after the projection would reference a column that was dropped.)
top10_with_target_px = (
    top10_with_target
    .withColumn('percentage', sfun.col('count') / float(row_count))
    .sort(sfun.asc('target'), sfun.desc('total'))
    .select('keyword', 'count', sfun.col('target').cast('String'), 'percentage')
)

fig = px.bar(top10_with_target_px.toPandas(),
             x='keyword', y='percentage',
             color='target',
             title='Top 10 Keywords as percentage of total rows',
             text='count'
            )
fig.update_yaxes(tickformat=".02%")
fig.show()

In [18]:
# The ten most frequent non-null locations with their overall row count.
top10_locations = (
    df.groupby('location')
    .count()
    .orderBy(sfun.desc('count'))
    .na.drop()
    .limit(10)
)

# Per-(target, location) counts, restricted to the top ten locations by the
# right join; `total` carries each location's overall count for ordering.
location_totals = top10_locations.select('location', sfun.col('count').alias('total'))
top10_with_target = (
    df.groupby('target', 'location')
    .count()
    .join(location_totals, on='location', how='right')
)

# Sort while `total` is still part of the frame, then project the plot columns.
# (Sorting after the projection would reference a column that was dropped.)
top10_with_target_px = (
    top10_with_target
    .withColumn('percentage', sfun.col('count') / float(row_count))
    .sort(sfun.asc('target'), sfun.desc('total'))
    .select('location', 'count', sfun.col('target').cast('String'), 'percentage')
)

fig = px.bar(top10_with_target_px.toPandas(),
             x='location', y='percentage',
             color='target',
             title='Top 10 Locations as percentage of total rows',
             text='count'
            )
fig.update_yaxes(tickformat=".02%")
fig.show()

In [19]:
# Row count per target value; target is cast to string so plotly colours it
# as a category.
target_counts = (
    df.select(sfun.col('target').cast('String'))
    .groupby('target')
    .count()
)

# A constant `column` value puts both targets on the same x position, so the
# bars stack; `percentage` is each target's share of all rows.
tweets = (
    target_counts
    .withColumn('column', sfun.lit('target'))
    .sort('target')
    .withColumn('percentage', sfun.col('count') / row_count)
)

fig = px.bar(
    tweets.toPandas(),
    x='column',
    y='percentage',
    color='target',
    text='count',
    title='Target as percentage of total rows',
)
fig.update_yaxes(tickformat=".02%")
fig.show()

In [20]:
# Peek at the first 15 tweet texts labelled target == 0.
target0_texts = df.filter(sfun.col('target') == 0).select('text')
target0_texts.rdd.map(lambda record: record['text']).take(15)

["What's up man?",
 'I love fruits',
 'Summer is lovely',
 'My car is so fast',
 'What a goooooooaaaaaal!!!!!!',
 'this is ridiculous....',
 'London is cool ;)',
 'Love skiing',
 'What a wonderful day!',
 'LOOOOOOL',
 "No way...I can't eat that shit",
 'Was in NYC last week!',
 'Love my girlfriend',
 'Cooool :)',
 'Do you like pasta?']

In [21]:
# Peek at the first 15 tweet texts labelled target == 1.
target1_texts = df.filter(sfun.col('target') == 1).select('text')
target1_texts.rdd.map(lambda record: record['text']).take(15)

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 'Forest fire near La Ronge Sask. Canada',
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
 '13,000 people receive #wildfires evacuation orders in California ',
 'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ',
 '#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires',
 '#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas',
 "I'm on top of the hill and I can see a fire in the woods...",
 "There's an emergency evacuation happening now in the building across the street",
 "I'm afraid that the tornado is coming to our area...",
 'Three people died from the heat wave so far',
 'Haha South Tampa is getting flooded hah- WAIT A SECOND I LIVE IN SOUTH TAMPA WHAT AM I GONNA DO WHAT AM I GONNA 

### Word count per category

In [22]:
def _word_target_pairs(row):
    """Emit ``(word, (target, 1 - target))`` for every whitespace-separated word.

    With a 0/1 target, the first tuple slot counts target-1 occurrences and
    the second counts target-0 occurrences.
    """
    label = row['target']
    return [(word, (label, 1 - label)) for word in row['text'].split()]


def _add_pairs(left, right):
    """Element-wise sum of two ``(target1_count, target0_count)`` pairs."""
    return (left[0] + right[0], left[1] + right[1])


# Lower-case the text so counting is case-insensitive.
lowered_tweets = df.select(sfun.lower(sfun.col('text')).alias('text'), 'target')
words = lowered_tweets.rdd.flatMap(_word_target_pairs)
wordcount = words.reduceByKey(_add_pairs)

# Rank by total occurrences across both targets, most frequent first.
top20_words = wordcount.sortBy(keyfunc=lambda row: -sum(row[1])).take(20)
top20_words

[('the', (1330, 1877)),
 ('a', (897, 1238)),
 ('in', (1144, 805)),
 ('to', (753, 1181)),
 ('of', (919, 895)),
 ('and', (494, 911)),
 ('i', (284, 1052)),
 ('is', (344, 586)),
 ('for', (394, 486)),
 ('on', (410, 424)),
 ('-', (389, 374)),
 ('you', (121, 625)),
 ('my', (130, 541)),
 ('with', (188, 374)),
 ('that', (181, 357)),
 ('at', (303, 229)),
 ('by', (274, 240)),
 ('it', (163, 341)),
 ('this', (177, 286)),
 ('from', (243, 177))]

In [24]:
# Split [(word, (target1_count, target0_count)), ...] into parallel tuples.
# The pairs go into `word_counts`, not `counts`, so the per-column counts
# computed earlier (and used as bar labels in the filled-rows chart) are not
# overwritten.
c_words, word_counts = zip(*top20_words)
# Long format: every word appears twice, once per target.
c_words *= 2

count1, count0 = zip(*word_counts)
# The first half holds the target-0 counts and the second half the target-1
# counts, matching the order of `c_target` below.
c_counts = count0 + count1

c_target = ['0'] * len(word_counts) + ['1'] * len(word_counts)

fig = px.bar(pd.DataFrame({'word': c_words,
                     'count': c_counts,
                     'target': c_target
                    }),
             x='word',
             y='count',
             title='Top 20 Words per Target',
             color='target')

fig.show()