In [138]:

import cognicity_text_loader as c
import logging
from sqlalchemy import create_engine
DATABASE = "riskmap"
engine = create_engine("postgresql://postgres:postgres@localhost:5432/"+ DATABASE)
 
LOGGER = logging.getLogger()
LOGGER.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

TEST_LOG_FILENAME = ".log_filename.log"
fh = logging.FileHandler(TEST_LOG_FILENAME)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
LOGGER.addHandler(fh)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
LOGGER.addHandler(ch)

config = {
    "database_engine": engine,
    "database_name": DATABASE,
    "location": "ch",
    "data_folder_prefix": "default_data",
    "logger": LOGGER
}
loader = c.CognicityTextLoader(config)

2019-07-24 10:48:18,727 - DEBUG - CognicityImageLoader constructed
2019-07-24 10:48:18,727 - DEBUG - CognicityImageLoader constructed
2019-07-24 10:48:18,727 - DEBUG - CognicityImageLoader constructed
2019-07-24 10:48:18,727 - DEBUG - CognicityImageLoader constructed
2019-07-24 10:48:18,727 - DEBUG - CognicityImageLoader constructed


In [139]:
text_data = loader.get_data()
text_data

Unnamed: 0_level_0,text
pkey,Unnamed: 1_level_1
155,Jjdithlh
158,Flooding and waterlogged streets
159,Flooding and waterlogged streets
160,Flood
161,Tree falan\n
167,Water stagnated
169,Waterlogging near cathedral road flyover
171,1st street Engineers avenue
173,Not that much water safe only
174,50cm water stagnant on the road


In [140]:
import numpy as np

def prepare_text(inp):
    '''
    returns a list of strings where each string is a different token
    '''
    # TODO replace this with regex?
    return inp.lower().replace('.', ' ').replace(',', ' ').split()
    
    
def count_frequency(pkey_text_df):
    '''
    Params: 
        pkey_text_df: pandas dataframe with index as pkey and one column named 'text'
    
    Returns:
        a dictionary of (str:int) that represents how often each 
        string is repeated in the text
    '''
    # TODO do we really need this?
    pass

def make_vocab(pkey_text_df):
    # go through all report texts creating set
    # of all possible words
    vocab = dict()
    occur = dict()
    index = 0
    reports_dict = dict()
    for row in pkey_text_df.iterrows():
        report_text_list = prepare_text(row[1]['text'])
        pkey = row[0]
        reports_dict[pkey] = report_text_list
        for word in report_text_list:
            if word not in vocab:
                vocab[word] = index
                index += 1
            if word not in occur:
                occur[word] = 1
            else:
                occur[word] += 1
    return (vocab, occur, reports_dict)
            
vocab, occur, reports_dict = make_vocab(text_data)
            
def make_unary_feature_vector(vocab, report_text_list):
    res = np.zeros((len(vocab), 1))
    for word in report_text_list:
        if word in vocab:
            res[vocab[word]][0] = 1
    return res

def make_feature_matrix(vocab, reports_dict, pos_pkeys, neg_pkeys):
    """
    params:
        vocab (dict): dictionary from word to index location along the
                  column vector
        report_texts: dictionary from pkeys to text
        pos_pkeys (set): membership in this set means the pkey should be 
                         labeled as postive
    Returns:
        tuple of (feature_matrix, labels)
        feature_matrix is a matrix of pkeys with associated feature columns underneath
        pkey      | pkey
        feat_vect | feat
        size: (len(vocab)+1, num_reports)
        labels is a matrix of size (1, num_reports) where report i is +1 if 
            pkey matching that index is in pos_pkeys and -1 else
    """
    
    labels = np.zeros((1, (len(reports_dict))))
    feature_matrix = np.zeros((len(vocab)+1, len(reports_dict)))
    i = 0
    for pkey, word_list in reports_dict.items():
        col = make_unary_feature_vector(vocab, word_list)
        col_w_pkey = np.insert(col, 0, [pkey], 0)
        feature_matrix[:, i] = col_w_pkey[:,0]
        if pkey in pos_pkeys:
            labels[0, i] = 1
        else: 
            labels[0, i] = -1
        i += 1
    return (feature_matrix, labels)

In [141]:
import pandas as pd
start_known_flood = "'2017-11-01 00:00:35.630000+05:30'" 
end_known_flood = "'2017-11-07 00:00:35.630000+05:30'"

def __get_flood_pkeys(start_date, end_date, engine):
    # gets the pkeys of reports during flood dates
    from sqlalchemy import text

    pkeys = pd.read_sql_query('''
        SELECT pkey, created_at FROM ''' + config['database_name'] + '''.all_reports WHERE
            created_at > %(start_date)s::timestamptz
                AND 
            created_at < %(end_date)s::timestamptz
    ''', params={"start_date": start_date, "end_date": end_date}, con=engine, index_col="pkey")
    
    return set(pkeys.index)
    
def __get_no_flood_pkeys(start_flood_date, end_flood_date, engine):
    # gets the pkeys of reports outside dates
    from sqlalchemy import text

    pkeys = pd.read_sql_query('''
        SELECT pkey, created_at FROM ''' + config['database_name'] + '''.all_reports WHERE
            created_at < %(start_date)s::timestamptz
                OR 
            created_at > %(end_date)s::timestamptz
    ''', params={"start_date": start_flood_date, "end_date": end_flood_date}, con=engine, index_col="pkey")

    return set(pkeys.index)

flood = __get_flood_pkeys(start_known_flood, end_known_flood, config['database_engine'])
no_flood = __get_no_flood_pkeys(start_known_flood, end_known_flood, config['database_engine'])
percent = len(flood)/(len(flood)+len(no_flood))
print("percent of flooding: ", percent)


percent of flooding:  0.75


In [142]:
chennai = pd.read_sql_query('''
    SELECT pkey, created_at, text, image_url, disaster_type, report_data, tags FROM ''' + config["database_name"] + '''.all_reports 
''', params={"start_date": start_known_flood, "end_date": end_known_flood}, con=config["database_engine"], index_col="pkey")
pd.options.display.max_rows = None
pd.options.display.max_colwidth = 1000
#from IPython.display import display
#chennai

In [143]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import image_recognition.perceptron as ml

In [144]:
features_w_pkey, labels = make_feature_matrix(vocab, reports_dict, flood, no_flood)
features = features_w_pkey[1:, :]

th, th0 = ml.perceptron(features, labels, params={"T":1000})

T:  0
score:  [[0.24695122]]
T:  200
score:  [[0.97256098]]
T:  400
score:  [[0.97256098]]
T:  600
score:  [[0.97256098]]
T:  800
score:  [[0.97256098]]


In [145]:
features_w_pkey

array([[521., 533., 535., ..., 507., 508., 509.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  1.,   1.,   0., ...,   0.,   0.,   0.],
       ...,
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.]])

In [146]:
res = np.dot(features.T, th) + th0
predicted_flood = (res > 0).T[0,:]

preds = np.vstack((features_w_pkey[0,:], predicted_flood)).T
preds

pred_w_pkey = pd.DataFrame(data=np.bool_(preds[:,1]), index=np.int_(preds[:,0]), columns=['predicted_flood'])
total_data = pd.concat([chennai,pred_w_pkey], axis=1, join='inner')

def f(inp):
    return inp.get('flood_depth', None)

total_data['flood_depth'] = total_data.report_data.apply(f)
# total_data['flood_depth'] =  pd.io.json.json_normalize(total_data.report_data, meta='flood_depth')['flood_depth']

total_data.drop(['tags'], axis=1)
# total_data['created_at'] = pd.to_datetime(total_data['created_at'], utc=True)
# total_data['created_at'] = pd.to_datetime(total_data['created_at'])
total_data

Unnamed: 0,created_at,text,image_url,disaster_type,report_data,tags,predicted_flood,flood_depth
155,2017-10-19 15:02:04.299000-04:00,Jjdithlh,,flood,"{'flood_depth': 42, 'report_type': 'flood'}","{'local_area_id': '162', 'instance_region_code': 'chn'}",False,42.0
158,2017-10-27 15:49:29.880000-04:00,Flooding and waterlogged streets,,flood,"{'flood_depth': 46, 'report_type': 'flood'}","{'local_area_id': '21', 'instance_region_code': 'chn'}",False,46.0
159,2017-10-27 16:37:12.300000-04:00,Flooding and waterlogged streets,,flood,"{'flood_depth': 72, 'report_type': 'flood'}","{'local_area_id': 'null', 'instance_region_code': 'null'}",False,72.0
160,2017-10-30 12:17:38.042000-04:00,Flood,,flood,"{'flood_depth': 44, 'report_type': 'flood'}","{'local_area_id': '156', 'instance_region_code': 'chn'}",True,44.0
161,2017-10-30 12:20:40.299000-04:00,Tree falan\n,,prep,{'report_type': 'treeclearing'},"{'local_area_id': '178', 'instance_region_code': 'chn'}",False,
167,2017-11-01 08:05:37.623000-04:00,Water stagnated,,flood,"{'flood_depth': 14, 'report_type': 'flood'}","{'local_area_id': '156', 'instance_region_code': 'chn'}",True,14.0
169,2017-11-01 08:47:48.620000-04:00,Waterlogging near cathedral road flyover,https://images.riskmap.in/1b229c90-66ec-4f11-bff7-6a20a6b50f81.jpg,flood,"{'flood_depth': 25, 'report_type': 'flood'}","{'local_area_id': '129', 'instance_region_code': 'chn'}",True,25.0
171,2017-11-01 19:03:54.168000-04:00,1st street Engineers avenue,https://images.riskmap.in/7cb82520-a663-4540-9198-b7ceeba3e3db.jpg,flood,"{'flood_depth': 76, 'report_type': 'flood'}","{'local_area_id': 'null', 'instance_region_code': 'chn'}",True,76.0
173,2017-11-01 21:59:18.828000-04:00,Not that much water safe only,,flood,"{'flood_depth': 2, 'report_type': 'flood'}","{'local_area_id': '194', 'instance_region_code': 'chn'}",True,2.0
174,2017-11-02 02:35:53.980000-04:00,50cm water stagnant on the road,,prep,{'report_type': 'drain'},"{'local_area_id': 'null', 'instance_region_code': 'chn'}",True,


In [179]:
%matplotlib widget



In [164]:
import matplotlib.pyplot as plt
#total_data = total_data.set_index(['created_at'], append=True, drop=False)
#plt.plot(total_data['created_at'], total_data['flood_depth'])
#total_data.loc[total_data.index < 400, :]
plot_data = total_data.loc[total_data.index < 600, :].copy()

# let's color pred flood as green, pred noFlood as red
plot_data.loc[plot_data.predicted_flood == True, 'c'] = 'green'
plot_data.loc[plot_data.predicted_flood == False, 'c'] = 'red'


#plot_data['c'] = 'red'
# plot_data.plot(x='created_at', y='flood_depth', color='blue', style='.')
plot_data['created_at'] = pd.to_datetime(plot_data.created_at, utc=True)

# utc -> Timestamp('2017-10-19 19:02:04.299000+0000', tz='UTC')
#plot_data[['created_at','flood_depth', 'c', 'predicted_flood']]

In [178]:
import matplotlib
matplotlib.get_backend()

'nbAgg'

In [177]:
path = plt.scatter(plot_data['created_at'], plot_data['flood_depth'], c=plot_data['c'], picker=True)
ax = path.axes
fig = ax.get_figure()
bottom_disp = ax.text(0,0, "", va="bottom", ha="left")
def onpick(event):
    index = event.ind
    row = plot_data.loc[index]
    ax.set_title("HI")
    bottom_disp.set_text("HELLO WORLD")
    
path.set_picker(onpick)
fig.canvas.mpl_connect('pick_event', onpick)

<IPython.core.display.Javascript object>

8

In [None]:
plot_data
#ml.score(features, labels, th, th0) # this is the number classified as positive

In [None]:
# here's how to see the weight of a particular word,
index_of_water = vocab['water']
th[index_of_water, :] # if this is positive it is correlated w/ flooding

th[vocab['flooded'], :] # if this is positive it is correlated w/ flooding

how_pos = dict()
reverse_dict = dict()
for key, value in vocab.items():
    how_pos[key] = int(th[value, :])
    reverse_dict[value] = key

how_pos

#for each in 'Nearby water body is overflowing'.lower().split():
#    print(each + ' ' + str(how_pos[each]))
    
for each in 'Spic Nagar rain water intrution from the velachery main road'.lower().split():
    print(each + ' ' + str(how_pos[each]))
    



In [None]:
def find_top_and_bot_n_words(n, index_to_word, th):
#    if n < 0:
#        n = len(index_to_word)-n
    #partition = np.argpartition(th[:,0], [-n, n]) #[-n:]
    #top_n_indicies = partition[-n:]
    #bottom_n_indicies = partition[:n]
    partition = np.argsort(th[:,0], axis=0) #[-n:]
    top_n_indicies = partition[:n]
    bottom_n_indicies = partition[-n:]
    all_ind = np.hstack((top_n_indicies, bottom_n_indicies))
    res =  []
    for each in all_ind:
        res.append((reverse_dict[each], th[each]))
    return res
    
top = find_top_and_bot_n_words(200, reverse_dict, th)
for each in top:
    word, score = each
    occ = occur[word]
    freq = occ/len(occur)
    print(each, freq)

In [None]:
a = np.argsort(th[:,0])
th[a]