In [2]:
import re

import pandas as pd
pd.options.display.max_rows = 6  # truncate DataFrame/Series displays that exceed 6 rows
pd.set_option('display.precision', 2)  # show floating-point values with 2 decimal places

from IPython.display import display, Markdown

import plotly.express as px
import plotly.io as pio
pio.templates.default = 'plotly_white'  # default template applied to every Plotly figure

In [3]:
# Load the microblog messages: prefer the local copy and fall back to
# downloading the shared Google Drive file when the local copy is missing.
try:
    # The result must be bound to `Messages`; otherwise the name is undefined
    # whenever the local file exists. Same parsing options as the download below.
    Messages = pd.read_csv('data/vast challenge 2011/Microblogs.csv', encoding='latin', on_bad_lines='skip')
except FileNotFoundError:
    VASTOPOLIS_URL = 'https://drive.google.com/file/d/1Y5xWPRKk8D_TNYmZ0ny4cQ83shGM5YqI/view?usp=sharing'
    # Turn the sharing link into a direct-download URL; the file id is the
    # second-to-last path segment of the sharing link.
    VASTOPOLIS_URL = 'https://drive.usercontent.google.com/download?id={}&export=download&authuser=0&confirm=t'.format(VASTOPOLIS_URL.split('/')[-2])

    Messages = pd.read_csv(VASTOPOLIS_URL, encoding='latin', on_bad_lines='skip')

# Parse the timestamps; values that do not match the format become NaT
Messages['Created_at'] = pd.to_datetime(
    Messages['Created_at'], format='%m/%d/%Y %H:%M', errors='coerce'
)

# Discard rows with missing values (this removes the rows whose Created_at
# failed to parse), then order the messages by creation time and ID
Messages = Messages.dropna().sort_values(by=['Created_at', 'ID'])

# Split "Location" at the first space into numeric latitude/longitude columns
coordinates = Messages['Location'].str.split(' ', n=1, expand=True).astype('float')
Messages[['latitude', 'longitude']] = coordinates
Messages = Messages.drop(columns='Location')

# Index the messages by their creation date/time
Messages.index = Messages['Created_at']

Messages

Unnamed: 0_level_0,ID,Created_at,text,latitude,longitude
Created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-04-30 00:00:00,3779,2011-04-30 00:00:00,#dontyoujusthateitwhen people touch your hair!...,42.18,93.54
2011-04-30 00:00:00,4656,2011-04-30 00:00:00,Alle kleren van me ouder uit de klerenkast hal...,42.29,93.51
2011-04-30 00:00:00,20692,2011-04-30 00:00:00,Normally I'd be excited to go play 18 on a sat...,42.17,93.23
...,...,...,...,...,...
2011-05-20 23:59:00,168923,2011-05-20 23:59:00,Is just worn out,42.25,93.31
2011-05-20 23:59:00,171438,2011-05-20 23:59:00,the whole day has just been terrible I have co...,42.23,93.29
2011-05-20 23:59:00,172464,2011-05-20 23:59:00,this fever is making it tough to have a life a...,42.25,93.42


# Histogram of reported symptoms

In [5]:
# Count, for every symptom keyword, the number of messages whose text contains it.
Symptoms = set('fever chill sweat ache pain fatigue cough breath nausea vomit diarrhoea enlarged lymph'.split(' '))
hist = dict.fromkeys(Symptoms, 0)

for sentence in Messages.text:
    if pd.isna(sentence):
        continue  # skip missing texts
    for symptom in Symptoms:
        # `in` also matches a keyword at the very start of the text; the
        # previous `find(...) > 0` test missed it because find() returns 0 there.
        if symptom in sentence:
            hist[symptom] += 1

# Order the keywords by descending message count
histogram = dict(sorted(hist.items(), key=lambda p: p[1], reverse=True))

In [94]:
# Print the raw counts, then draw the same data as a bar chart
print(histogram)
symptom_names, message_counts = histogram.keys(), histogram.values()
px.bar(x=symptom_names, y=message_counts)

{'chill': 12237, 'ache': 10366, 'fever': 7796, 'breath': 6744, 'pain': 5667, 'sweat': 4260, 'cough': 3445, 'fatigue': 3430, 'nausea': 822, 'vomit': 648, 'lymph': 9, 'diarrhoea': 7, 'enlarged': 4}


In [7]:
# Simpler (though less efficient) alternative: scan the text column once per symptom.
# Summing the Boolean result of str.contains() counts the matching messages.

symptom_names = 'fever chill sweat ache pain fatigue cough breath nausea vomit diarrhoea enlarged lymph'.split(' ')
hist = []
for symptom in symptom_names:
    matches = Messages.text.str.contains(symptom)
    hist.append((symptom, matches.sum()))

histogram = dict(sorted(hist, key=lambda pair: pair[1], reverse=True))
px.bar(x=histogram.keys(), y=histogram.values())

In [93]:
# Keep every symptom keyword except the three excluded below
excluded_symptoms = ['lymph', 'diarrhoea', 'enlarged']
Symptoms = [name for name, _count in hist if name not in excluded_symptoms]
print(Symptoms)

['fever', 'chill', 'sweat', 'ache', 'pain', 'fatigue', 'cough', 'breath', 'nausea', 'vomit']


# Manipulation of time series data

Reference: [How to handle time series data with ease](https://pandas.pydata.org/docs/getting_started/intro_tutorials/09_timeseries.html)

In [9]:
# Earliest and latest creation time in the data set
created_at = Messages['Created_at']
(created_at.min(), created_at.max())

(Timestamp('2011-04-30 00:00:00'), Timestamp('2011-05-20 23:59:00'))

## Messages sent on April 30

In [7]:
# The index holds the creation timestamps, so a date string selects all rows of that day
Messages.loc['2011-04-30']

Unnamed: 0_level_0,ID,Created_at,text,latitude,longitude
Created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-04-30 00:00:00,3779,2011-04-30 00:00:00,#dontyoujusthateitwhen people touch your hair!...,42.18,93.54
2011-04-30 00:00:00,4656,2011-04-30 00:00:00,Alle kleren van me ouder uit de klerenkast hal...,42.29,93.51
2011-04-30 00:00:00,20692,2011-04-30 00:00:00,Normally I'd be excited to go play 18 on a sat...,42.17,93.23
...,...,...,...,...,...
2011-04-30 23:59:00,171502,2011-04-30 23:59:00,yeaaa && he was just PRACTICE . He aint in ya ...,42.22,93.34
2011-04-30 23:59:00,173651,2011-04-30 23:59:00,What a jolly nice chap Will Young is.,42.23,93.36
2011-04-30 23:59:00,177522,2011-04-30 23:59:00,I keep hearing fire trucks,42.17,93.55


In [14]:
# Inspect the first message and the components of its timestamp
first_message = Messages.iloc[0]
display(first_message)
created = first_message.Created_at
created.month, created.day, created.hour

ID                                                         3779
Created_at                                  2011-04-30 00:00:00
text          #dontyoujusthateitwhen people touch your hair!...
latitude                                                  42.18
longitude                                                 93.54
Name: 2011-04-30 00:00:00, dtype: object

(4, 30, 0)

# Messages sent in the first week of May

In [8]:
# Slice by date strings once, then show the rows and their number
first_week_of_may = Messages.loc['2011-05-01':'2011-05-07']
display(first_week_of_may)
len(first_week_of_may)

Unnamed: 0_level_0,ID,Created_at,text,latitude,longitude
Created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-05-01 00:00:00,4656,2011-05-01 00:00:00,I instantly felt about 10 tyms better. Sumtime...,42.23,93.35
2011-05-01 00:00:00,16024,2011-05-01 00:00:00,ARTS BRIEFLY; Iran in Brooklyn,42.22,93.34
2011-05-01 00:00:00,29873,2011-05-01 00:00:00,No Eminem that is not why they call it 'windo...,42.23,93.21
...,...,...,...,...,...
2011-05-07 23:59:00,154410,2011-05-07 23:59:00,I have to wake up in 5 1/2 hours. Someone is g...,42.21,93.36
2011-05-07 23:59:00,173651,2011-05-07 23:59:00,#WatchuWontDo is be my age and try to get with...,42.21,93.56
2011-05-07 23:59:00,180221,2011-05-07 23:59:00,It is 06:42 UTC now,42.21,93.36


317003

# Attributes of messages sent on the specified date

In [9]:
# Show selected attributes of all messages sent on 2011-04-30.
# `.loc` is the label-based selector for multi-row results; `.at` is meant
# for reading a single scalar value.
for column in ('text', 'latitude', 'longitude'):
    display(Markdown(f'## {column}'))
    display(Messages.loc['2011-04-30', column])

## text

Created_at
2011-04-30 00:00:00    #dontyoujusthateitwhen people touch your hair!...
2011-04-30 00:00:00    Alle kleren van me ouder uit de klerenkast hal...
2011-04-30 00:00:00    Normally I'd be excited to go play 18 on a sat...
                                             ...                        
2011-04-30 23:59:00    yeaaa && he was just PRACTICE . He aint in ya ...
2011-04-30 23:59:00                What a jolly nice chap Will Young is.
2011-04-30 23:59:00                           I keep hearing fire trucks
Name: text, Length: 44322, dtype: object

## latitude

Created_at
2011-04-30 00:00:00    42.18
2011-04-30 00:00:00    42.29
2011-04-30 00:00:00    42.17
                       ...  
2011-04-30 23:59:00    42.22
2011-04-30 23:59:00    42.23
2011-04-30 23:59:00    42.17
Name: latitude, Length: 44322, dtype: float64

## longitude

Created_at
2011-04-30 00:00:00    93.54
2011-04-30 00:00:00    93.51
2011-04-30 00:00:00    93.23
                       ...  
2011-04-30 23:59:00    93.34
2011-04-30 23:59:00    93.36
2011-04-30 23:59:00    93.55
Name: longitude, Length: 44322, dtype: float64

# Messages sent between particular times in a day

In [10]:
# Restrict to April 30, then keep the messages created between 01:00 and 02:00
april_30 = Messages.loc['2011-04-30']
april_30.between_time('1:00', '2:00')

Unnamed: 0_level_0,ID,Created_at,text,latitude,longitude
Created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-04-30 01:00:00,3116,2011-04-30 01:00:00,The Westfield Fashion Dectetor 'app' is still ...,42.21,93.40
2011-04-30 01:00:00,9964,2011-04-30 01:00:00,#BreakUpLines If we were to break up how would...,42.29,93.30
2011-04-30 01:00:00,11500,2011-04-30 01:00:00,My cast iron pan is like woah.,42.23,93.56
...,...,...,...,...,...
2011-04-30 02:00:00,155506,2011-04-30 02:00:00,Used my GPS for the first time it ended WITH ...,42.27,93.39
2011-04-30 02:00:00,161198,2011-04-30 02:00:00,#DontYouJustHateItWhen guys think they have game.,42.29,93.27
2011-04-30 02:00:00,163360,2011-04-30 02:00:00,According to the weather channel the high for ...,42.28,93.43


In [11]:
# How many messages do / do not contain "chill"
mentions_chill = Messages['text'].str.contains('chill')
mentions_chill.value_counts()

text
False    1010649
True       12237
Name: count, dtype: int64

# Grouping the dataset by date

In [75]:
# Group the messages by calendar date (the time of day is dropped)
creation_date = Messages['Created_at'].dt.date
messages_per_date = Messages.groupby(creation_date)
messages_per_date

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x12dd88450>

In [76]:
# Number of messages on each calendar date
creation_date = Messages['Created_at'].dt.date
num_messages_per_date = Messages.groupby(creation_date).size()
num_messages_per_date

Created_at
2011-04-30    44322
2011-05-01    45245
2011-05-02    45072
              ...  
2011-05-18    61863
2011-05-19    71269
2011-05-20    69866
Length: 21, dtype: int64

In [77]:
# Daily message counts as a line chart
px.line(
    x=num_messages_per_date.index,
    y=num_messages_per_date.values,
    labels=dict(x='Date', y='Number of messages'),
)

Honestly, I did not expect to see this jump in the number of messages starting from May 17.  The count for May 19 is 71.3K, far above the roughly 45K per day seen at the start of the period.

To compare the messages posted before and after May 17, the y-axis range is adjusted below so that it starts at zero.

In [83]:
# Start the y-axis at zero and leave 10% headroom above the busiest day
peak_count = num_messages_per_date.values.max()
Range_MessageCounts = [0, peak_count * 1.1]

px.line(
    x=num_messages_per_date.index,
    y=num_messages_per_date.values,
    range_y=Range_MessageCounts,
    labels=dict(x='Date', y='Number of messages'),
)

# Looking into May 15 - May 20 per hour

It may seem that the `.groupby(...dt.hour)` technique works...

In [67]:
# NOTE: despite its name, this frame starts on May 15
MessagesAfterMay16 = Messages.loc['2011-05-15':]
# Grouping by hour of day merges the same hour of different dates
hour_of_day = MessagesAfterMay16['Created_at'].dt.hour
MessagesAfterMay16.groupby(hour_of_day).size()

Created_at
0     10052
1     12722
2     12720
      ...  
21    27260
22    19720
23    13637
Length: 24, dtype: int64

but this is not what we intended, because the messages are grouped by hour of day regardless of the date.  What we want is to group them by each date-hour combination:

In [68]:
# One group per date-hour: bin Created_at into hourly intervals
hourly = pd.Grouper(key='Created_at', freq='H')
Messages.loc['2011-05-15':].groupby(hourly).size()

Created_at
2011-05-15 00:00:00    1555
2011-05-15 01:00:00    2092
2011-05-15 02:00:00    1749
                       ... 
2011-05-20 21:00:00    5020
2011-05-20 22:00:00    3828
2011-05-20 23:00:00    2774
Freq: H, Length: 144, dtype: int64

In [84]:
def _():
    """Plot the number of messages per hour from 2011-05-15 onwards."""
    recent = Messages.loc['2011-05-15':]
    hourly_counts = recent.groupby(pd.Grouper(key='Created_at', freq='H')).size()
    # y-axis from zero to 10% above the busiest hour
    y_limits = [0, hourly_counts.values.max() * 1.1]
    figure = px.line(
        x=hourly_counts.index,
        y=hourly_counts.values,
        range_y=y_limits,
        labels={'x': 'Date-Hour', 'y': 'Number of messages'},
    )
    figure.show()

_()

This result is interesting. The data up to May 17th probably shows the normal daily patterns of the people of Vastopolis.
The spike at 9pm is probably because people tend to use social networking sites after dinner. It seems that the people of this city go to bed at 11pm.
From morning to evening, the level of activity on social networking sites gradually increases.
Apart from the fact that there is a higher volume of messages in the early morning on May 17, there are no noticeable changes. On May 18, in addition to the trends from the previous day, the volume of messages during the day has increased.
On May 19th and 20th, the volume of messages sent at night was double the usual amount. Something seems to be happening.

# 

# Symptom-related Message Counts

Up to this point, we have simply observed the number of messages without looking at the content of the messages.
Now, let's observe the number of messages that contain words related to the symptoms.

First, we will add an attribute that expresses whether or not each message contains various symptoms as a Boolean value. For example, if the `chill` attribute value of a certain message is `True`, it means that the message contains the word "chill". If the attribute value is `False`, it means that the message does not contain "chill".

In [85]:
# Boolean Series: True where the message text contains "fever"
Messages['text'].str.contains('fever')

Created_at
2011-04-30 00:00:00    False
2011-04-30 00:00:00    False
2011-04-30 00:00:00    False
                       ...  
2011-05-20 23:59:00    False
2011-05-20 23:59:00    False
2011-05-20 23:59:00     True
Name: text, Length: 1022886, dtype: bool

In [95]:
# Display the symptom keywords currently held in `Symptoms`
Symptoms

['fever',
 'chill',
 'sweat',
 'ache',
 'pain',
 'fatigue',
 'cough',
 'breath',
 'nausea',
 'vomit']

In [103]:
# One Boolean column per symptom: True where the message text contains the keyword
symptom_flags = {name: Messages['text'].str.contains(name) for name in Symptoms}
Messages = Messages.assign(**symptom_flags)

# 'symptoms' is True when at least one of the symptom columns is True
Messages['symptoms'] = Messages[Symptoms].any(axis='columns')
Messages

Unnamed: 0_level_0,ID,Created_at,text,latitude,longitude,fever,chill,sweat,ache,pain,fatigue,cough,breath,nausea,vomit,symptoms
Created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-04-30 00:00:00,3779,2011-04-30 00:00:00,#dontyoujusthateitwhen people touch your hair!...,42.18,93.54,False,False,False,False,False,False,False,False,False,False,False
2011-04-30 00:00:00,4656,2011-04-30 00:00:00,Alle kleren van me ouder uit de klerenkast hal...,42.29,93.51,False,False,False,False,False,False,False,False,False,False,False
2011-04-30 00:00:00,20692,2011-04-30 00:00:00,Normally I'd be excited to go play 18 on a sat...,42.17,93.23,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011-05-20 23:59:00,168923,2011-05-20 23:59:00,Is just worn out,42.25,93.31,False,False,False,False,False,False,False,False,False,False,False
2011-05-20 23:59:00,171438,2011-05-20 23:59:00,the whole day has just been terrible I have co...,42.23,93.29,False,True,False,False,False,False,False,False,False,False,True
2011-05-20 23:59:00,172464,2011-05-20 23:59:00,this fever is making it tough to have a life a...,42.25,93.42,True,False,False,False,False,False,False,False,False,False,True


## Counts of messages including symptoms

In [104]:
def _():
    """Plot, per hour from 2011-05-15 onwards, the messages flagged in the 'symptoms' column."""
    symptomatic = Messages.query('symptoms == True').loc['2011-05-15':]
    hourly_counts = symptomatic.groupby(pd.Grouper(key='Created_at', freq='H')).size()
    # y-axis from zero to 10% above the busiest hour
    y_limits = [0, hourly_counts.values.max() * 1.1]
    figure = px.line(
        x=hourly_counts.index,
        y=hourly_counts.values,
        range_y=y_limits,
        labels={'x': 'Date-Hour', 'y': 'Number of messages'},
    )
    figure.show()

_()

In [155]:
def trend_of_symptom(symptom, start='2011-05-15'):
    """Plot the hourly number of messages flagged with `symptom`.

    Parameters
    ----------
    symptom : str
        Name of a Boolean column of `Messages`, e.g. 'fever', or 'symptoms'
        for messages mentioning any symptom.
    start : str, optional
        First date (inclusive) of the plotted period; defaults to '2011-05-15'.
    """
    flagged = Messages.query(f'{symptom} == True').loc[start:]
    messages = flagged.groupby(pd.Grouper(key='Created_at', freq='H')).size()
    # max() of an empty array raises ValueError, so fall back to Plotly's
    # automatic axis range when no message matches.
    range_y = [0, messages.values.max() * 1.1] if len(messages) else None
    px.line(x=messages.index, y=messages.values,
            range_y=range_y,
            title=f'Trend of {symptom}',
            labels={'x': 'Date-Hour', 'y': 'Number of messages'}).show()

trend_of_symptom('symptoms')

# Trends of symptom types

Here are figures of the trends by symptom type.  To save space, only the trends of selected symptom types are shown.

In [157]:
selected = Symptoms[:3]  # only the first three symptoms are shown
for symptom in selected:
    trend_of_symptom(symptom)

len(Symptoms)

In [162]:
import plotly.graph_objects as go

def _(symptoms):
    """Overlay the hourly message counts of each symptom from 2011-05-17 onwards.

    Parameters
    ----------
    symptoms : iterable of str
        Names of the Boolean symptom columns of `Messages`; one trace is drawn per name.
    """
    # Only messages that mention at least one symptom
    messages = Messages.loc['2011-05-17':].query('symptoms == True')

    # The y-axis shows message counts, so it is titled accordingly
    fig = go.Figure(layout=go.Layout(title='Trend of symptoms', xaxis={'title': 'Date-Hour'}, yaxis={'title': 'Number of messages'}, width=1600, height=1000))

    # Iterate over the argument (not the global `Symptoms`) so the caller decides what is plotted
    for symptom in symptoms:
        counts = messages.query(f'{symptom} == True').groupby(pd.Grouper(key='Created_at', freq='H')).size()
        fig.add_trace(go.Scatter(x=counts.index, y=counts.values, mode='lines', name=symptom))

    fig.show()

_(Symptoms)

In [165]:
import plotly.graph_objects as go

def _(symptoms):
    """Overlay the hourly trend of each symptom from 2011-05-17 onwards, scaled to its own peak.

    Each trace is divided by its maximum so that symptoms with very different
    message volumes can be compared by shape.

    Parameters
    ----------
    symptoms : iterable of str
        Names of the Boolean symptom columns of `Messages`; one trace is drawn per name.
    """
    # Only messages that mention at least one symptom
    messages = Messages.loc['2011-05-17':].query('symptoms == True')

    # The y-axis shows counts relative to each symptom's peak, so it is titled accordingly
    fig = go.Figure(layout=go.Layout(title='Trend of symptoms', legend_title_text='Symptoms', xaxis={'title': 'Date-Hour'}, yaxis={'title': 'Number of messages (relative to peak)'}, width=1600, height=1000))

    # Iterate over the argument (not the global `Symptoms`) so the caller decides what is plotted
    for symptom in symptoms:
        counts = messages.query(f'{symptom} == True').groupby(pd.Grouper(key='Created_at', freq='H')).size()
        if counts.empty:
            continue  # nothing to normalise: max() of an empty array would raise
        fig.add_trace(go.Scatter(x=counts.index, y=counts.values / counts.values.max(), mode='lines', name=symptom))

    fig.show()

_(Symptoms)

Looking at the patterns for each symptom, it seems that there are groups of symptoms.

Let's observe the group of symptoms that increase in frequency during the daytime on May 18th. To do this, toggle the legend to hide the four symptoms (pain, cough, nausea, vomit) that are less frequent on this day. As a result, fever, chill, sweat, ache, fatigue and breath remain. While fever becomes more frequent after May 19th, the other symptoms are on a decreasing trend.

From this analysis, we obtained two clusters of symptoms: {fever} and {chill, sweat, ache, fatigue, breath}. We will call them *Fever* and *Chill*:

Now let's display the symptoms hidden earlier: pain, cough, nausea, and vomit. From this result, it seems that nausea starts to decrease from May 20th, but other symptoms are getting worse. We introduce two more symptom types {pain,cough,vomit} and {nausea} and name them Pain and Nausea, respectively:

(Isn't it strange that the incidence of nausea is low, while vomiting is still high?  Also strange is that the trends of fever and chill are not aligned...)

In [166]:
# Symptom clusters read off the trend plots
ST_Fever = ['fever']
ST_Chill = ['chill', 'sweat', 'ache', 'fatigue', 'breath']
# Explicit list: the former 'pain, cough,v omit'.split(', ') produced
# ['pain', 'cough,v omit'] because of the misplaced space.
ST_Pain = ['pain', 'cough', 'vomit']
ST_Nausea = ['nausea']