In [3]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append("../scripts/twitter")
from searchTwitter import TwitterDataFrame
import utils as ut

# ML imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

# Set up sns
sns.set(rc = {'figure.figsize':(10,10)})
sns.set_style("whitegrid")

In [4]:
# Reads files in a directory as csv and returns dataframe
def concat_data(base_dir):
    files = os.listdir(base_dir)
    dfs = []
    for f in files:
        dfs.append(pd.read_csv(base_dir + f))
        
    return pd.concat(dfs)

health_df = concat_data('../data/san_francisco/')
health_df.head()

  dfs.append(pd.read_csv(base_dir + f))


Unnamed: 0,id,author_id,text,geo,created_at,lat,lon
0,968999128641323008,25624940,@Volker_E Am I hallucinating that you are walk...,{'place_id': '5a110d312052166f'},2018-02-28 23:59:30,37.708075,-122.514926
1,968999100757680128,8888,There’s just something weird about living in N...,{'place_id': '5ef5b7f391e30aff'},2018-02-28 23:59:23,37.845953,-122.324818
2,968999065273774080,1557223812,Tomorrow’s March sheeeesh I’m basically 24 😕 t...,{'place_id': '5ecbd073f39c00fa'},2018-02-28 23:59:14,37.592632,-122.160814
3,968999056537088000,1164993320,Wednesday really be draining 6-6 school day 😴,{'place_id': '5ecbd073f39c00fa'},2018-02-28 23:59:12,37.592632,-122.160814
4,968999038493245440,18650764,@remedy415 @Brycesavoy510 This hella dope,{'place_id': 'ab2f2fac83aa388d'},2018-02-28 23:59:08,37.699279,-122.34266


In [5]:
# Turn into a TwitterDataFrame and zoom in on san francisco
health_df = TwitterDataFrame(health_df)
lat = (37, 39)
lon = (-121, -124)
sf_df_raw = health_df.zoom_in(lat, lon)

In [6]:
# Take a random subsample of the dataframe to reduce the size
sf_df_raw = sf_df_raw.sample(50000, replace=True)

# Ideally this should be before sampling but this way saves time
# Could this happen in TwitterDataFrame?
sf_df_raw['created_at'] = [datetime.strptime(d,'%Y-%m-%d %H:%M:%S') for d in sf_df_raw['created_at']]

# Subset 5/2018-12/2018
sf_df_raw = sf_df_raw[sf_df_raw['created_at'] > datetime(2018, 4, 30)]

# Change date quantum to day
sf_df_raw['date'] = sf_df_raw['created_at'].dt.floor('d')

sf_df = TwitterDataFrame(sf_df_raw)
sf_df.head()

Unnamed: 0,id,author_id,text,geo,created_at,lat,lon,date
82172,1075097271799242752,2203747617,That flatbread beat so smooth 🤗,{'place_id': '5ef5b7f391e30aff'},2018-12-18 18:35:38,37.845953,-122.324818,2018-12-18
182883,1059216766595780615,2480034637,@PizzaToThePolls @SuttaCBSMiami @CBSMiami Bles...,{'place_id': '71d33f776fe41dfb'},2018-11-04 22:52:11,37.954027,-122.157021,2018-11-04
148645,995488374952783872,17527081,I think I have successfully gamed the Virgin A...,{'place_id': '272f29aa61fa05d3'},2018-05-13 02:18:18,37.581956,-122.425557,2018-05-13
3324,1068533961192689664,1058831,@gangsterhealth @CheriJacobus @xtrixcyclex @hi...,{'place_id': 'ab2f2fac83aa388d'},2018-11-30 15:55:23,37.699279,-122.34266,2018-11-30
95066,1008791745054662659,946393785335869440,@TammyJLemley My cat does that,{'place_id': '000e96b4e9f8503f'},2018-06-18 19:21:09,37.720367,-122.332411,2018-06-18


In [None]:
# What are the basics of this data set?
print("Lenth: ", len(sf_df))
print("Per day :", len(sf_df)/365)

In [None]:
# Plot num tweets per day by date
plt.figure(figsize=(8,5))
plt.plot(sf_df.count_by_day())
ax = plt.gca()
ax.set_ylabel("Tweets / day")
ax.set_xlabel("Date")

In [None]:
import cartopy
import cartopy.feature as cfeature
fig = plt.figure(figsize=(20,10))
ax = fig.add_axes([0, 0, 1, 1], projection=cartopy.crs.LambertConformal(central_longitude=-98.0))
ax.set_extent((-120, -75,21, 50), cartopy.crs.Geodetic())
ax.coastlines()

# Add state boundaries
states_provinces = cfeature.NaturalEarthFeature(
        category='cultural',
        name='admin_1_states_provinces_lines',
        scale='50m',
        facecolor='none')

ax.add_feature(states_provinces, edgecolor='gray')
ax.add_feature(cfeature.BORDERS)
ax.scatter(sf_df['lon'],sf_df['lat'], transform=cartopy.crs.PlateCarree())
# plt.show()

In [7]:
# Read in the daily AQI in 2018 California
aqi_2018 = pd.read_csv('../data/daily_aqi_by_county_2018.csv')

# Subset just SF country and extract dates
sf_aqi = aqi_2018[(aqi_2018['State Name']=='California') & (aqi_2018['county Name']=='San Francisco')]
sf_aqi['Date'] = pd.to_datetime(sf_aqi['Date'])

sf_aqi.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sf_aqi['Date'] = pd.to_datetime(sf_aqi['Date'])


Unnamed: 0,State Name,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting
27443,California,San Francisco,6,75,2018-01-01,72,Moderate,PM2.5,06-075-0005,1
27444,California,San Francisco,6,75,2018-01-02,82,Moderate,PM2.5,06-075-0005,1
27445,California,San Francisco,6,75,2018-01-03,111,Unhealthy for Sensitive Groups,PM2.5,06-075-0005,1
27446,California,San Francisco,6,75,2018-01-04,58,Moderate,PM2.5,06-075-0005,1
27447,California,San Francisco,6,75,2018-01-05,26,Good,NO2,06-075-0005,1


</br>
</br>
</br>

## Experiment 1: Predicting AQI with individual tweets
----

In [8]:
# Merge AQI and tweet dateframes
sf_mrg = pd.merge(left=sf_df, right=sf_aqi, left_on='date', right_on='Date', how='inner')

# extract our regression variables
X = sf_mrg['text']
y = sf_mrg['AQI']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

sf_mrg.head()

Unnamed: 0,id,author_id,text,geo,created_at,lat,lon,date,State Name,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting
0,1075097271799242752,2203747617,That flatbread beat so smooth 🤗,{'place_id': '5ef5b7f391e30aff'},2018-12-18 18:35:38,37.845953,-122.324818,2018-12-18,California,San Francisco,6,75,2018-12-18,36,Good,PM2.5,06-075-0005,1
1,1074864716135116801,1382907001,@KamalaHarris College is not affordable! How d...,{'place_id': '5a110d312052166f'},2018-12-18 03:11:33,37.708075,-122.514926,2018-12-18,California,San Francisco,6,75,2018-12-18,36,Good,PM2.5,06-075-0005,1
2,1074845471997255680,723963253391400960,Peeling my shorts off my ass like a layer of d...,{'place_id': '322ed6e7cc8ff243'},2018-12-18 01:55:04,37.942618,-122.592422,2018-12-18,California,San Francisco,6,75,2018-12-18,36,Good,PM2.5,06-075-0005,1
3,1074951773440204800,2746596348,@ChalieHustle510 Thankyouuu cuddy❤️❤️💯🙌🏾,{'place_id': '712d61ba26321517'},2018-12-18 08:57:29,38.014234,-122.274848,2018-12-18,California,San Francisco,6,75,2018-12-18,36,Good,PM2.5,06-075-0005,1
4,1074822970181206016,700539414556979200,"@connectwithAB Would only really benefit CGC, ...",{'place_id': '26b9557935d73cba'},2018-12-18 00:25:39,37.881262,-122.327841,2018-12-18,California,San Francisco,6,75,2018-12-18,36,Good,PM2.5,06-075-0005,1


In [None]:
X_train.shape, y_train.shape

In [None]:
# Text regression pipeline
def train_model(X_train, y_train, X_test, y_test, params=(100, 5)):
    n_dim, k = params

    # Vectorize tweets
    # TODO: more data cleaning, lemmetization, etc
    tf_vect = TfidfVectorizer(min_df=1, stop_words="english").fit(X_train)
    X_enc = tf_vect.transform(X_train)
    X_test_enc = tf_vect.transform(X_test)
    
    # Run trunvated SVD on the vector encodings
    svd_model = TruncatedSVD(n_components=n_dim, n_iter=4, random_state=23).fit(X_enc)
    X_lsa = svd_model.transform(X_enc)
    X_test_lsa = svd_model.transform(X_test_enc)

    # Fit Regression model
    knn_model = KNeighborsRegressor(k, n_jobs=-1).fit(X_lsa, y_train)
    y_pred = knn_model.predict(X_lsa)

    return svd_model, knn_model, X_lsa, y_pred, X_test_lsa


In [None]:
svd_model, knn_model, X_lsa, y_pred, X_test_lsa = train_model(X_train, y_train, X_test, y_test)
X_lsa.shape, knn_model.score(X_lsa, y_train)

In [None]:
# Scatter the results
plt.scatter(X_lsa[:, 0], X_lsa[:, 1])
plt.title("First Two Dimensions of SVD Encoding")

In [None]:
# plot scatterplot and linear model using predictions
plt.scatter(X_lsa[:, 0], y_train, c=y_pred)

plt.title("AQI by Low Dimension Tweets")
plt.ylabel("AQI", fontsize = 12)
plt.xlabel("Low Dimension tweet", fontsize = 12)

In [None]:
# Plots predictions by their true value
def plot_reference(y_pred, y_train):
    plot = sns.scatterplot(x=y_pred, y=y_train)

    # Draw reference lines
    X = np.linspace(0, 200, 200)
    plt.plot(X, X, '--', color="#cc2727", linewidth=2.5)
    plt.plot(X, X/2, '--', color="#e85151", linewidth=2.5)
    plt.plot(X/2, X, '--', color="#e85151", linewidth=2.5)

    # Set plot style
    plot.set(ylim=(0, 240))
    plot.set(xlim=(0,240))

    plot.set_title("Sensor AQI by Predicted AQI")
    plot.set_ylabel("Sensor AQI", fontsize = 12)
    plot.set_xlabel("Model AQI", fontsize = 12)

plot_reference(y_pred, y_train)

</br>
</br>
</br>

## Experiment 2: Predicting AQI with tweets grouped by day  
----

In [14]:
# Group tweets by day before merging
sf_df_days = sf_df.group_by_day()

# Merge AQI and tweet dateframes
sf_mrg_days = pd.merge(left=sf_df_days, right=sf_aqi, left_on='date', right_on='Date', how='inner')

# extract our regression variables
X_days = sf_mrg_days['text']
y_days = sf_mrg_days['AQI']

X_train_days, X_test_days, y_train_days, y_test_days = train_test_split(X_days, y_days, test_size=0.2, random_state=23)

sf_mrg_days.head()

Unnamed: 0,text,State Name,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting
0,@ClopezFox Yep! Gonna get ready for school soo...,California,San Francisco,6,75,2018-05-01,37,Good,Ozone,06-075-0005,1
1,"@marylynn Oh, my dear lord. See how I misspell...",California,San Francisco,6,75,2018-05-02,36,Good,Ozone,06-075-0005,1
2,Ugh crazy intense dream where some cult people...,California,San Francisco,6,75,2018-05-03,36,Good,Ozone,06-075-0005,1
3,@MightyMere @CarolHoward Exhausting kinda unde...,California,San Francisco,6,75,2018-05-04,33,Good,Ozone,06-075-0005,1
4,"As someone who loved RoM1, it warms my heart t...",California,San Francisco,6,75,2018-05-05,37,Good,Ozone,06-075-0005,1


In [None]:
X_train_days.shape, y_train_days.shape

In [None]:
svd_model_days, knn_model_days, X_lsa_days, y_pred_days, X_test_lsa_days = train_model(X_train_days, y_train_days, X_test_days, y_test_days, (10000, 5))
X_lsa_days.shape, knn_model_days.score(X_lsa_days, y_train_days)

In [None]:
plot_reference(y_pred_days, y_train_days)

</br>
</br>
</br>

## Experiment 3: Predicting (classifying) AQI outliers by tweets grouped by day
----

In [None]:
X_test_days.shape, X_test_lsa_days.shape, y_test_days.shape

In [None]:
y_train_clf = y_train_days > 100
y_test_clf = y_test_days > 100

# Oversample outliers
X_lsa_days_over = np.concatenate((X_lsa_days, X_lsa_days[y_train_clf]))
y_train_clf_over = np.concatenate((y_train_clf, y_train_clf[y_train_clf]))

In [None]:
# Use random forest model to predict high aqi days
model_clf = RandomForestClassifier().fit(X_lsa_days_over, y_train_clf_over)
y_test_clf_pred = model_clf.predict(X_test_lsa_days)

model_clf.score(X_test_lsa_days, y_test_clf)

In [None]:
X = np.linspace(0, y_test_days.shape[0], y_test_days.shape[0])
plot = sns.scatterplot(x=X, y=y_test_days, hue=y_test_clf_pred)

plot.set_ylabel("Sensor AQI", fontsize = 12)
plot.set_xlabel("Sample num", fontsize = 12)


</br>
</br>
</br>

## Experiment 4: Predicting AQI by tweets using word embeddings
----

In [15]:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
import itertools
from keras import layers

In [16]:
X_train_days.shape, X_test_days.shape, y_train_days.shape, y_test_days.shape

((196,), (49,), (196,), (49,))

In [18]:
# https://nlp.stanford.edu/projects/glove/
# a function to load vector model as dictionary
def load_glove_model(File):
    glove_model = {}
    with open(File,'r',encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded")
    return glove_model

glove = load_glove_model('../data/glove.twitter.27B.100d.txt')
emb_size = 100 # glove vectors are 100 dim

1193514 words loaded


In [25]:
# first we convert the twitter text to word indices
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token='<UNK>')
tokenizer.fit_on_texts(X_train_days)
word_index = tokenizer.word_index

In [27]:
# we want to import the vectors of Glove into the embedding_layer using
# embedding_amtrix which has a  shape of (10000, 100)
embedding_matrix = np.zeros((max_words, emb_size)) 
# we loop whole vocabulary of the imdb reviews
for word, i in word_index.items():
    # if the token is smaller than the threshold we set at import
    if i < max_words:
        # we try to get the corresponding embedding from glove
        embedding_vector = glove.get(word) 
        # if this embedding exists, we feed it into our embedding_layer matrix
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector # Words not found in the embedding index will all be zeros

In [34]:
cnn_model = Sequential()
cnn_model.add(layers.Embedding(max_words, emb_size))
cnn_model.add(layers.Conv1D(128, 5, activation='relu'))
cnn_model.add(layers.MaxPooling1D(2))
cnn_model.add(layers.Conv1D(64, 5, activation='relu'))
cnn_model.add(layers.GlobalAveragePooling1D())
cnn_model.add(layers.Dense(32, activation = "relu"))
cnn_model.add(layers.Dense(1, activation="linear")) 
cnn_model.summary()

cnn_model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['accuracy'])

# we load our GloVe model into the embedding layer
cnn_model.layers[0].set_weights([embedding_matrix])
# the GloVe embedding is not changed by the training
cnn_model.layers[0].trainable = False 


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         1000000   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 128)         64128     
_________________________________________________________________
global_average_pooling1d_2 ( (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 1,068,289
Trainable params: 1,068,289
Non-trainable params: 0
_________________________________________________________________


In [37]:
history = cnn_model.fit(X_train_days, y_train_days, 
                    epochs=2,
                    batch_size=1, 
                    validation_data=(X_test_days, y_test_days))

Epoch 1/2


ValueError: in user code:

    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\keras\engine\training.py:853 train_function  *
        return step_function(self, iterator)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\keras\engine\training.py:842 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1286 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2849 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3632 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\keras\engine\training.py:835 run_step  **
        outputs = model.train_step(data)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\keras\engine\training.py:787 train_step
        y_pred = self(x, training=True)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\keras\engine\base_layer.py:1037 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\keras\engine\sequential.py:369 call
        return super(Sequential, self).call(inputs, training=training, mask=mask)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\keras\engine\functional.py:414 call
        return self._run_internal_graph(
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\keras\engine\functional.py:550 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\keras\engine\base_layer.py:1037 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\keras\layers\convolutional.py:249 call
        outputs = self._convolution_op(inputs, self.kernel)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\ops\nn_ops.py:1131 convolution_v2
        return convolution_internal(
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\ops\nn_ops.py:1261 convolution_internal
        return op(
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\util\deprecation.py:617 new_func
        return func(*args, **kwargs)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\util\deprecation.py:617 new_func
        return func(*args, **kwargs)
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\ops\nn_ops.py:2003 conv1d
        result = gen_nn_ops.conv2d(
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\ops\gen_nn_ops.py:968 conv2d
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\framework\op_def_library.py:748 _apply_op_helper
        op = g._create_op_internal(op_type_name, inputs, dtypes=None,
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\framework\func_graph.py:599 _create_op_internal
        return super(FuncGraph, self)._create_op_internal(  # pylint: disable=protected-access
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\framework\ops.py:3561 _create_op_internal
        ret = Operation(
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\framework\ops.py:2041 __init__
        self._c_op = _create_c_op(self._graph, node_def, inputs,
    C:\Users\Nick\miniconda3\envs\bergin\lib\site-packages\tensorflow\python\framework\ops.py:1883 _create_c_op
        raise ValueError(str(e))

    ValueError: Negative dimension size caused by subtracting 5 from 1 for '{{node sequential_2/conv1d_4/conv1d}} = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], explicit_paddings=[], padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true](sequential_2/conv1d_4/conv1d/ExpandDims, sequential_2/conv1d_4/conv1d/ExpandDims_1)' with input shapes: [1,1,1,100], [1,5,100,128].


In [None]:
score = cnn_model.evaluate(X_test_days, y_test, verbose=0)
print('Test accuracy: {:.4}'.format(score[1]))

## Conclusions

#### Review

- Random twitter data is super noisy, very few tweets relate to our topic
- LSA, small-batch cnn probably don't weigh them enough
    - i.e. to dim-reduce input data we are removing the features we are looking for
- Oversampling doesn't seem to work very well even in simple clf
- That said, regression is surprisingly effective

#### Ideas

- Topic modeling seems like a good idea to give us a way to remove noisy samples
- 