# Orthodox Monitoring
Assumption: Orthodox will visit families on the holiday
Problem: Most of them are infectious
Solution: Monitor their movement in the future in order to alert about TAZ that might become badly infected

Goal: Use the fact that they usually visit families on weekends to determine which TAZ will be visited by them on passover, in order to be able to point out possibly problematic TAZ

##### TODO LIST:
There are some functions that are doubled (with minor changes) when handling city sectors and TAZs. Now we need to pay attention what function was compiled last. Code should be refactored so that there will be only one type of function for both scenarios

#### But How?
0. "clean data" 
1. Read all trips at a given day and save them in a matrix
2. Determine how much each origin is important for me (orthodox rates, infected rate...)
3. Calculate how many visitors each TAZ have at a moment of interest
4. for each (TAZ, TAZ) calc how many infected are transmitted using the outputs of step 3 and 2 and sum for each TAZ -> vector

## Imports

In [1]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
import random
import plotly.graph_objs as go
import plotly.express as px
import os
import glob

## Constants

In [2]:
# Constants

# Paths
HAREDIM_INFO_PATH = "/data/overflow/for_noya/Data/Haredim/CitySectorHaredim.xlsx"
HAREDIM_TAZ_SHEET_NAME = "HaredimVs2630"


policy_excel_path = "/data/overflow/for_noya/Data/policyChangeDate.xlsx"
HAREDIM_OD_FILES_FOLDER_PATH = "/data/overflow/for_noya/Data/Haredim/hourly/"
OD_FILES_FOLDER_PATH = "/data/overflow/for_noya/Data/OD/hourly/"
STOPS_PATH = "/data/overflow/for_noya/Data/Stops/stops_hourly.csv"
HAREDIM_PATH = "/data/overflow/for_noya/ofek_ds/GIS/haredim-Israel-div.xlsx"
OUT_PATH = "/data/overflow/for_noya/ofek_ds/landauof/haredim_visitors_{}.csv"

# Formats
OD_COLUMNS = ["origin", "destination", "trips", "start_hour", "avg_travel_time", "avg_dist", "partition_day"] # for TAZ
HAREDIM_OD_COLUMNS = ["origin", "destination", "trips", "start_hour", "partition_day"]
INTERESTING_OD_COLUMNS = ["origin","destination","trips","start_hour"]

WEIGHTED_OD_COLUMNS = ["origin", "destination", "trips"]
FINAL_RANKS_COLUMNS = ["taz", "score"]
STOP_COLUMNS = ['partition_day', 'hour', 'stop_tazid', 'count_imsi', 'imsi_type']

# Dates and Time
FIRST_RELEVANT_DAY = pd.Timestamp(year=2020, month=2, day=1)
FRIDAY = 4
SATURDAY = 5

# START_HOUR = 14 # from when to start monitor
FRIDAY_REASONABLE_HOUR = 11
SATURDAY_REASONABLE_HOUR = 2
SHABBAT_START_HOUR = 16
SHABBAT_END_HOUR = 15

## Helper Functions

### Data Importing & Exporting Functions

In [3]:
# Get a pd.Series of TAZ and how much infected (0<=_<=1)
def get_TAZ_ranks():
#     ans = pd.Series(data=[random.random() for _ in range(1,2631)], index=[taz for taz in range(1,2631)])
    xls = pd.ExcelFile(HAREDIM_INFO_PATH)
    haredim_taz = pd.read_excel(xls, HAREDIM_TAZ_SHEET_NAME)
    ans = pd.Series(data=[1 if taz in haredim_taz.Taz2630.values else 0 for taz in range(1,2631)], 
          index=[taz for taz in range(1,2631)])
    return ans

# Get a path and return pd.Timestamp of its date using its format
def parse_path(path):
    date_as_str = path[-12:-4]
    year = int(date_as_str[:4])
    month = int(date_as_str[4:6])
    day = int(date_as_str[6:8])
    return pd.Timestamp(year=year, month=month, day=day)

# Given a pd.Timestamp, return the relevant OD file
def get_od_df_of_date(date):
    for path in glob.glob("{}/*".format(OD_FILES_FOLDER_PATH)):
        date_of_file = parse_path(path)
        if date_of_file == date:
            ans = pd.read_csv(path, header=0, names=OD_COLUMNS)[INTERESTING_OD_COLUMNS]
            ans.trips.replace({1: 7}, inplace=True)
            return ans
        
# Handle file, name and export to a CSV file
def export_csv(df, date):
    df = df[(df['trips'] > 0) & (pd.notnull(df['trips']))]
    df.to_csv(OUT_PATH.format((str(date)).replace("-", "")[:8]))

    
def check_file_exists(date):
    return os.path.isfile(OUT_PATH.format((str(date)).replace("-", "")[:8]))

### Data Processing Functions

In [4]:
# Get a multiindex OD matrix, multiply it by the scores from a TAZ_ranks file and normalize the values to be summed to 1
def handle_multiindex_ans_df(df):
    taz_ranks = get_TAZ_ranks()
    for index in df.index.levels[0]: # first index (destination)
        temp_df = df.loc[friday_od_df.index.get_level_values('destination') == index]
        temp_df = temp_df.reset_index().set_index('origin')[['trips']]
        temp_df = temp_df.mul(taz_ranks, axis=0)
        temp_df.dropna(inplace=True)

        temp_df = temp_df/temp_df.sum()

        temp_df.index.name = 'origin'
        temp_df.reset_index(inplace=True)
        temp_df['destination'] = index
        temp_df.set_index(['destination', 'origin'], drop=True, inplace=True)
        df.loc[friday_od_df.index.get_level_values('destination') == index] = temp_df
    return df


# Fill empty visitors number in a given range of ids
def fill_empty_visitors(df, low=1, high=2631,def_value=0):
    for code in range(low, high):
        if code not in df.index.values:
            df.loc[code] = def_value
    return df

# Get visitors numbers at a given hour & day
def get_visitors_number_specific_time(stops_df, date, hour):
    ans = stops_df.loc[(stops_df.partition_day == date) & 
                       (stops_df.hour == hour) & 
                       (stops_df.imsi_type == "Visitor")]
    ans = ans[['stop_tazid', 'count_imsi']]
    ans.set_index('stop_tazid', drop=True, inplace=True)
    return ans

# Get estimation of amount of visitors in each TAZ
def estimate_visitors_number(stops_df):
    visitors_pre_shabbat = get_visitors_number_specific_time(stops_df, requested_friday, SHABBAT_START_HOUR)
    visitors_pre_monitoring = get_visitors_number_specific_time(stops_df, requested_friday, FRIDAY_REASONABLE_HOUR)
    # Try to average with a value from the day after's night, might fail because the last day don't have a tomorrow
    # in the data
    visitors_night = get_visitors_number_specific_time(stops_df, requested_saturday, SATURDAY_REASONABLE_HOUR)
    visitors_est = visitors_pre_shabbat.sub(visitors_pre_monitoring)
    if len(visitors_night) > 0:
        visitors_est = (visitors_est.add(visitors_night))/2
    else:
        visitors_est = visitors_pre_shabbat.sub(visitors_pre_monitoring)

    # Fill missing values
    visitors_est = fill_empty_visitors(visitors_est)
#     for taz in range(1,2631):
#         if taz not in visitors_est.index.values:
#             visitors_est.loc[taz] = 0
            
    return visitors_est


# See how many areas had less visitors during shabbat compared to our estimation
def check_estimations(visitors_mid_shabbat, visitors_estimated):
    visitors_mid_shabbat.sort_index(inplace=True)
    visitors_estimated.sort_index(inplace=True)
    
    return (visitors_mid_shabbat<visitors_subbed).sum()

### Program Flow Functions

In [5]:
# TODO ?

# OLD

#### But How?
0. "clean data" 
1. Find what file keeps orthodox rate of each TAZ -> vector
2. Find what file holds weekend trips for each (TAZ, TAZ) -> n * matrix
3. Determine how important each trip (new is more relevant than old and stuff) -> a matrix
4. for each (TAZ, TAZ) calc how many infected are transmitted using the outputs of step 3 and 1 and sum for each TAZ -> vector

In [1]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
import random
import plotly.graph_objs as go
import plotly.express as px
import os
import glob

In [50]:
# Constants

# Paths
TAZ_path = "/data/overflow/for_noya/Data/locations.csv"
out_files_folder_path = "/data/overflow/for_noya/ofek_ds/landauof/"
policy_excel_path = "/data/overflow/for_noya/Data/policyChangeDate.xlsx"
HAREDIM_OD_FILES_FOLDER_PATH = "/data/overflow/for_noya/Data/Haredim/hourly/"
OD_FILES_FOLDER_PATH = "/data/overflow/for_noya/Data/OD/hourly/"
STOPS_PATH = "/data/overflow/for_noya/Data/Stops/stops_hourly.csv"
HAREDIM_PATH = "/data/overflow/for_noya/ofek_ds/GIS/haredim-Israel-div.xlsx"
OUT_PATH = "/data/overflow/for_noya/ofek_ds/landauof/haredim_visitors_{}.csv"
OUT_CITY_PATH = "/data/overflow/for_noya/ofek_ds/landauof/haredim_visitors_city_{}.csv"

# Formats
OD_COLUMNS = ["origin", "destination", "trips", "start_hour", "avg_travel_time", "avg_dist", "partition_day"] # for TAZ
HAREDIM_OD_COLUMNS = ["origin", "destination", "trips", "start_hour", "partition_day"]

WEIGHTED_OD_COLUMNS = ["origin", "destination", "trips"]
FINAL_RANKS_COLUMNS = ["taz", "score"]
STOP_COLUMNS = ['partition_day', 'hour', 'stop_tazid', 'count_imsi', 'imsi_type']

# Dates and Time
FIRST_RELEVANT_DAY = pd.Timestamp(year=2020, month=2, day=1)
FRIDAY = 4
SATURDAY = 5

# START_HOUR = 14 # from when to start monitor
FRIDAY_REASONABLE_HOUR = 11
SATURDAY_REASONABLE_HOUR = 3
SHABBAT_START_HOUR = 16
SHABBAT_END_HOUR = 15

## UPDATED 4.7.2020 - Calculation of orthodox visitors BY CITY SECTOR ID

### Helper Functions

In [47]:
# UPDATED 4.7.2020 - Calculation of orthodox visitors BY CITY SECTOR ID


def get_od_df_of_date(date):
    for path in glob.glob("{}/*".format(HAREDIM_OD_FILES_FOLDER_PATH)):
        date_of_file = parse_path(path)
        if date_of_file == date:
            ans = pd.read_csv(path, header=0, names=HAREDIM_OD_COLUMNS)[["origin","destination","trips","start_hour"]]
            ans.trips.replace({1: 7}, inplace=True)
            return ans


def get_stops_df():
    stops_df = pd.read_csv(STOPS_PATH, header=0, names=STOP_COLUMNS, parse_dates=['partition_day'])
    taz_to_city = pd.read_excel("/data/overflow/for_noya/ofek_ds/GIS/Taz2CitySector.xlsx")
    taz_to_city = dict(zip(taz_to_city.Taz2630, taz_to_city.Over8Population))
    stops_df['city_code'] = stops_df['stop_tazid'].map(taz_to_city)
    stops_df.drop('stop_tazid', 1, inplace=True)
    return stops_df

def normalize_multiindex_ans_df(df):
    for index in df.index.levels[0]: # first index (destination)
        temp_df = df.loc[df.index.get_level_values('destination') == index]

        temp_df = temp_df/temp_df.sum()

        temp_df.index.name = 'origin'
        temp_df.reset_index(inplace=True)
        temp_df['destination'] = index
        temp_df.set_index(['destination', 'origin'], drop=True, inplace=True)
        df.loc[friday_od_df.index.get_level_values('destination') == index] = temp_df
    return df

# returns a df with how many visitors at a given time in per city
def get_visitors_of_time(stops_df, date, hour):
    ans = stops_df.loc[(stops_df.partition_day == date) & 
                                    (stops_df.hour == hour) & 
                                    (stops_df.imsi_type == "Visitor")]
    ans = ans[['count_imsi','city_code']]
    ans = ans.groupby('city_code').sum()
    return ans


# Fill empty visitors number in a given range of codes
def fill_empty_visitors(df, low=400000, high=401000,def_value=0):
    for code in range(low, high):
        if code not in df.index.values:
            df.loc[code] = 0
    return df

# See how many areas had less visitors during shabbat compared to our estimation
def check_estimations(visitors_mid_shabbat, visitors_estimated):
    visitors_mid_shabbat.sort_index(inplace=True)
    visitors_estimated.sort_index(inplace=True)
    
    return (visitors_mid_shabbat<visitors_subbed).sum()

### Create the Files

In [64]:
# UPDATED 4.7.2020 - Calculation of orthodox visitors BY CITY SECTOR ID

# STEP 1: Read 2 ODs of requested weekend (input: date)
# NOTE: 400000 is "Other"
for path in glob.glob("{}/*".format(HAREDIM_OD_FILES_FOLDER_PATH)):
    # UPDATED 4.7.2020 ~16:30 - Calculation of orthodox visitors BY TAZ ID

    # STEP 1: Read 2 ODs of requested weekend (input: date)
    requested_friday = parse_path(path)
    friday_od_df = get_od_df_of_date(requested_friday)

    requested_saturday = requested_friday + pd.Timedelta('1d')
    saturday_od_df = get_od_df_of_date(requested_saturday)

    stops_df = get_stops_df()


    # STEP 2: Sum all trips of before shabbat
    friday_od_df = friday_od_df.loc[(friday_od_df.start_hour >= FRIDAY_REASONABLE_HOUR) & 
                                 (friday_od_df.start_hour <= SHABBAT_START_HOUR)]
    friday_od_df = friday_od_df.groupby(['destination','origin'])[['trips']].sum()


    # STEP 3: Normalize
    friday_od_df = normalize_multiindex_ans_df(friday_od_df)


    # STEP 4: Sub visitors pre summing from visitors pre shabbat
    visitors_pre_shabbat = get_visitors_of_time(stops_df, requested_friday, SHABBAT_START_HOUR)
    visitors_pre_monitoring = get_visitors_of_time(stops_df, requested_friday, FRIDAY_REASONABLE_HOUR)


    visitors_subbed = visitors_pre_shabbat.sub(visitors_pre_monitoring)

    # STEP 6: Min step 5 with visitors during shabbat
    visitors_mid_shabbat = get_visitors_of_time(stops_df, requested_saturday, SATURDAY_REASONABLE_HOUR)

    visitors_mid_shabbat = fill_empty_visitors(visitors_mid_shabbat)
    visitors_subbed = fill_empty_visitors(visitors_subbed)


    # STEP 5: Multiply step 3 by total visitors from step 4
    for index in friday_od_df.index.levels[0]: # first index (destination)
        friday_od_df.loc[
            friday_od_df.index.get_level_values('destination') == index] = friday_od_df.loc[
            friday_od_df.index.get_level_values('destination') == index] * visitors_subbed.loc[index]['count_imsi']


    # Shows how mazny Taz got weird results
    # NOTE: why not taking this min and use it at level 5
    check_estimations(visitors_mid_shabbat,visitors_subbed)


    # Export to CSV
    friday_od_df = friday_od_df[(friday_od_df['trips'] > 0) & (pd.notnull(friday_od_df['trips']))]
    friday_od_df.to_csv(OUT_CITY_PATH.format((str(requested_friday)).replace("-", "")[:8]))

### Make Final DF

In [67]:
ans = pd.DataFrame(index=range(400000,401000))
for path in glob.glob(OUT_CITY_PATH.format("*")):
    # read the file
    current_date_df = pd.read_csv(path)
    # grup by origin (sum)
    current_date_df = current_date_df.groupby('destination').sum()
    current_date_df.drop('origin',axis=1,inplace=True)
    # sort by City Sector
    current_date_df.sort_index(inplace=True)
    # fill missing values with zeros
    for cs in range(400000,401000):
        if cs not in current_date_df.index.values:
            current_date_df.loc[cs,'trips'] = 0
    # append COLUMN to ans
    date_as_string = path[-12:-4] # Changed because csv instead of xlsx
    ans[date_as_string] = current_date_df.trips
#     ans = ans.append(current_date_df)

In [52]:
xls = pd.ExcelFile('/data/overflow/for_noya/Data/Haredim/CitySectorHaredim.xlsx')
df1 = pd.read_excel(xls, 'UniqueHaredimCity')
df2 = pd.read_excel(xls, 'OtherTaz2630')
df3 = pd.read_excel(xls, 'HaredimVs2630')
df4 = pd.read_excel(xls, 'Taz2CitySector')

In [70]:
ans.sum()

20200201      250.0
20200329      398.0
20200318     3186.0
20200324      929.0
20200304     5123.0
20200305     5629.0
20200217     5561.0
20200220     3463.0
20200401      830.0
20200202     5009.0
20200212     5022.0
20200331       99.0
20200402      506.0
20200321    17649.0
20200205     6104.0
20200226     5394.0
20200228     1972.0
20200317      132.0
20200322     1678.0
20200215    39317.0
20200308     1767.0
20200214      343.0
20200314    31052.0
20200309     1622.0
20200302    10733.0
20200406      552.0
20200216     5118.0
20200403      230.0
20200221      595.0
20200223     3869.0
             ...   
20200313       38.0
20200310    37973.0
20200222    36331.0
20200303      838.0
20200204     4599.0
20200311    43997.0
20200301     3076.0
20200320        5.0
20200315     2280.0
20200229    31343.0
20200224     4807.0
20200325      749.0
20200227     2919.0
20200404     4346.0
20200207        0.0
20200327        0.0
20200326      351.0
20200219     4815.0
20200208    41518.0


In [45]:
pd.read_excel("/data/overflow/for_noya/ofek_ds/GIS/Taz2CitySector.xlsx")

Unnamed: 0,Taz2630,CityNameSector,Over8Population
0,1,'שעלבים נוף איילון',400845
1,2,'תל אביב -יפו',400855
2,3,'תל אביב -יפו',400855
3,4,'תל אביב -יפו',400855
4,5,'תל אביב -יפו',400855
5,6,'תל אביב -יפו',400855
6,7,'תל אביב -יפו',400855
7,8,'תל אביב -יפו',400855
8,9,'תל אביב -יפו',400855
9,10,'תל אביב -יפו',400855


## UPDATED 4.7.2020 ~16:30 - Calculation of orthodox visitors BY TAZ ID

In [41]:
# UPDATED 4.7.2020 ~16:30 - Calculation of orthodox visitors BY TAZ ID

# STEP 1: Read 2 ODs of requested weekend (input: date)
requested_friday = pd.Timestamp('2020-02-08')
for path in glob.glob("{}/*".format(OD_FILES_FOLDER_PATH)):
    date_of_file = parse_path(path)
    if date_of_file == requested_friday:
        friday_od_df = pd.read_csv(path, header=0, names=OD_COLUMNS)[["origin","destination","trips","start_hour"]]
        friday_od_df.trips.replace({1: 7}, inplace=True)
        break
    
requested_saturday = requested_friday + pd.Timedelta('1d')
for path in glob.glob("{}/*".format(OD_FILES_FOLDER_PATH)):
    date_of_file = parse_path(path)
    if date_of_file == requested_saturday:
        saturday_od_df = pd.read_csv(path, header=0, names=OD_COLUMNS)[["origin","destination","trips","start_hour"]]
        saturday_od_df.trips.replace({1: 7}, inplace=True)
        break

        
stops_df = pd.read_csv(STOPS_PATH, header=0, names=STOP_COLUMNS, parse_dates=['partition_day'])

     
# STEP 2: Sum all trips of before shabbat
friday_od_df = friday_od_df.loc[(friday_od_df.start_hour >= FRIDAY_REASONABLE_HOUR) & 
                             (friday_od_df.start_hour <= SHABBAT_START_HOUR)]
friday_od_df = friday_od_df.groupby(['destination','origin'])[['trips']].sum()


# STEP 3: Multiply by orthodox rate and normalize
taz_ranks = get_TAZ_ranks()
for index in friday_od_df.index.levels[0]: # first index (destination)
    temp_df = friday_od_df.loc[friday_od_df.index.get_level_values('destination') == index]
    temp_df = temp_df.reset_index().set_index('origin')[['trips']]
    temp_df = temp_df.mul(taz_ranks, axis=0)
    temp_df.dropna(inplace=True)
    
    temp_df = temp_df/temp_df.sum()
    
    temp_df.index.name = 'origin'
    temp_df.reset_index(inplace=True)
    temp_df['destination'] = index
    temp_df.set_index(['destination', 'origin'], drop=True, inplace=True)
    friday_od_df.loc[friday_od_df.index.get_level_values('destination') == index] = temp_df
friday_od_df


# STEP 4: Sub visitors pre summing from visitors pre shabbat
visitors_pre_shabbat = stops_df.loc[(stops_df.partition_day == requested_friday) & 
                                    (stops_df.hour == SHABBAT_START_HOUR) & 
                                    (stops_df.imsi_type == "Visitor")]
visitors_pre_shabbat = visitors_pre_shabbat[['stop_tazid', 'count_imsi']]
visitors_pre_shabbat.set_index('stop_tazid', drop=True, inplace=True)

visitors_pre_monitoring = stops_df.loc[(stops_df.partition_day == requested_friday) & 
                                    (stops_df.hour == FRIDAY_REASONABLE_HOUR) & 
                                    (stops_df.imsi_type == "Visitor")]
visitors_pre_monitoring = visitors_pre_monitoring[['stop_tazid', 'count_imsi']]
visitors_pre_monitoring.set_index('stop_tazid', drop=True, inplace=True)

# NEWW
visitors_night = stops_df.loc[(stops_df.partition_day == requested_saturday) & 
                                    (stops_df.hour == 2) & 
                                    (stops_df.imsi_type == "Visitor")]
visitors_night = visitors_night[['stop_tazid', 'count_imsi']]
visitors_night.set_index('stop_tazid', drop=True, inplace=True)

visitors_subbed = visitors_pre_shabbat.sub(visitors_pre_monitoring)
# NEWW
visitors_subbed = (visitors_subbed.add(visitors_night))/2

for taz in range(1,2631):
    if taz not in visitors_subbed.index.values:
        visitors_subbed.loc[taz] = 0

# STEP 5: Multiply step 3 by total visitors from step 4
for index in friday_od_df.index.levels[0]: # first index (destination)
    friday_od_df.loc[friday_od_df.index.get_level_values('destination') == index] = friday_od_df.loc[friday_od_df.index.get_level_values('destination') == index] * visitors_subbed.loc[index]['count_imsi']

# # STEP 6: Min step 5 with visitors during shabbat
visitors_mid_shabbat = stops_df.loc[(stops_df.partition_day == requested_saturday) & 
                                    (stops_df.hour == SATURDAY_REASONABLE_HOUR) & 
                                    (stops_df.imsi_type == "Visitor")]
visitors_mid_shabbat = visitors_mid_shabbat[['stop_tazid', 'count_imsi']]
visitors_mid_shabbat.set_index('stop_tazid', drop=True, inplace=True)
for taz in range(1,2631):
    if taz not in visitors_mid_shabbat.index.values:
        visitors_mid_shabbat.loc[taz] = 0

visitors_mid_shabbat.sort_index(inplace=True)
visitors_subbed.sort_index(inplace=True)


friday_od_df = friday_od_df[(friday_od_df['trips'] > 0) & (pd.notnull(friday_od_df['trips']))]
friday_od_df.to_csv(OUT_PATH.format((str(requested_friday)).replace("-", "")[:8]))
# # Shows how mazny Taz got weird results
# # NOTE: why not taking this min and use it at level 5
# (visitors_mid_shabbat<visitors_subbed).sum()

NameError: name 'parse_path' is not defined

In [200]:
pd.read_csv(OUT_PATH.format((str(pd.Timestamp('2020-02-08'))).replace("-", "")[:8]))

Unnamed: 0,destination,origin,trips
0,1,731,125.800000
1,1,733,62.900000
2,1,742,62.900000
3,1,938,62.900000
4,4,236,601.500000
5,8,741,255.500000
6,8,1150,255.500000
7,9,938,813.000000
8,10,241,128.375000
9,10,243,128.375000


In [189]:
# friday_od_df[friday_od_df['trips'] <0]
pd.read_csv(OUT_PATH.format((str(requested_friday)).replace("-", "")[:8])).describe()
# 14 -> 316
# 15 -> 423
# 16 -> 442

Unnamed: 0,destination,origin,trips
count,13771.0,13771.0,13771.0
mean,1010.713383,874.000436,86.032677
std,601.433026,482.63388,137.675396
min,1.0,201.0,0.083333
25%,503.0,249.0,20.589286
50%,1077.0,938.0,46.0
75%,1334.0,1165.0,97.75
max,2630.0,2486.0,3396.0


In [42]:
# Get a pd.Series of TAZ and how much infected (0<_<1)
def get_TAZ_ranks():
#     ans = pd.Series(data=[random.random() for _ in range(1,2631)], index=[taz for taz in range(1,2631)])
    xls = pd.ExcelFile('/data/overflow/for_noya/Data/Haredim/CitySectorHaredim.xlsx')
    haredim_taz = pd.read_excel(xls, 'HaredimVs2630')
    ans = pd.Series(data=[1 if taz in haredim_taz.Taz2630.values else 0 for taz in range(1,2631)], 
          index=[taz for taz in range(1,2631)])
    return ans

# Get a path and return pd.Timestamp of its date using its format
def parse_path(path):
    date_as_str = path[-12:-4]
    year = int(date_as_str[:4])
    month = int(date_as_str[4:6])
    day = int(date_as_str[6:8])
    return pd.Timestamp(year=year, month=month, day=day)

# Gets all relevant OD file names (weekends)
# Returns a list of (filenames, day of week)
def get_relevant_OD_files(date=None):
    ans=[]
    for path in glob.glob("{}/*".format(OD_FILES_FOLDER_PATH)):
        date_of_file = parse_path(path)
        if date_of_file.weekday() in [FRIDAY, SATURDAY]:
            ans.append((path, date_of_file.weekday()))
    return ans

# Build a df of how many average trips will occur on Passover
# Will be "simulated" by building a weighted df based on weekend trips (sum of trips pre-Shabbat minus trips during Shabbat)
def build_weighted_OD_df(relevant_files_and_days):
    # Make lists for all possible pairs
    o_vals, d_vals = zip(*[(x,y) for x in range(1,2631) for y in range(1,2631)])
    ans = pd.DataFrame(columns=WEIGHTED_OD_COLUMNS, data={"origin": o_vals, 'destination': d_vals})
    ans.set_index(["origin","destination"], inplace=True)
    ans.fillna(0, inplace=True)
        
    for path, day in relevant_files_and_days:
        curr_df = pd.read_csv(path, header=0, names=OD_COLUMNS)[["origin","destination","trips","start_hour"]]
        curr_df.set_index(["origin","destination"], inplace=True)
        
        # Get only pre-shabbat hours
        if day==FRIDAY:
            curr_df_to_add = curr_df.loc[(curr_df.start_hour >= FRIDAY_REASONABLE_HOUR) & 
                                         (curr_df.start_hour <= SHABBAT_START_HOUR)]
            curr_df_to_add = curr_df_to_add.groupby(['origin','destination'])[['trips']].sum()
            curr_df_to_sub = curr_df.loc[curr_df.start_hour > SHABBAT_START_HOUR]
            curr_df_to_sub = curr_df_to_sub.groupby(['origin','destination'])[['trips']].sum()
            
            ans = ans.add(curr_df_to_add, fill_value=0)
            ans = ans.sub(curr_df_to_sub, fill_value=0)
        # is Saturday: Get only shabbat hours
        else:
            curr_df_to_sub = curr_df.loc[curr_df.start_hour > SHABBAT_END_HOUR]
            curr_df_to_sub = curr_df_to_sub.groupby(['origin','destination'])[['trips']].sum()
            ans = ans.sub(curr_df_to_sub, fill_value=0)
    
    return ans

# Get total number of visitors in each TAZ at Shabbat
# TODO
def get_all_taz_visitors(relevant_OD_files):
    relevant_dates = [parse_path(path) for (path,_) in relevant_OD_files]
    all_stops = pd.read_csv(STOPS_PATH, parse_dates=['partition_day'])
    return relevant_dates


# Based on weighted trip df and taz ranks, calculate a df of TAZ | score
def get_final_ranks(weighted_od, taz_ranks):
    ans = pd.DataFrame(FINAL_RANKS_COLUMNS, data={"taz": [taz for taz in range(1,2631)]})
    
    
    
    return ans

In [171]:
taz_ranks = get_TAZ_ranks()
weighted_OD = build_weighted_OD_df()

In [73]:
!ls /data/overflow/for_noya/Data/OD/hourly

hourly_output_20200201.csv  hourly_output_20200305.csv
hourly_output_20200202.csv  hourly_output_20200306.csv
hourly_output_20200203.csv  hourly_output_20200307.csv
hourly_output_20200204.csv  hourly_output_20200308.csv
hourly_output_20200205.csv  hourly_output_20200309.csv
hourly_output_20200206.csv  hourly_output_20200310.csv
hourly_output_20200207.csv  hourly_output_20200311.csv
hourly_output_20200208.csv  hourly_output_20200312.csv
hourly_output_20200209.csv  hourly_output_20200313.csv
hourly_output_20200210.csv  hourly_output_20200314.csv
hourly_output_20200211.csv  hourly_output_20200315.csv
hourly_output_20200212.csv  hourly_output_20200316.csv
hourly_output_20200213.csv  hourly_output_20200317.csv
hourly_output_20200214.csv  hourly_output_20200318.csv
hourly_output_20200215.csv  hourly_output_20200319.csv
hourly_output_20200216.csv  hourly_output_20200320.csv
hourly_output_20200217.csv  hourly_output_20200321.csv
hourly_output_20200218.csv  hourly_output_202003

In [20]:
pd.read_csv(Path("/data/overflow/for_noya/Data/Stops/stops_hourly.csv")).columns

Index(['partition_day', ' hour', ' stop_tazid', ' count_imsi', ' imsi_type'], dtype='object')

In [199]:
for path in glob.glob("{}/*".format(OD_FILES_FOLDER_PATH)):
    print(path)

/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200220.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200406.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200308.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200325.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200331.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200227.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200305.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200329.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200208.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200326.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200304.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200311.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200228.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200320.csv
/data/overflow/for_noya/Data/OD/hourly/hourly_output_20200203.csv
/data/over

# For Presentation

### Making haredim table

In [99]:
# Finding out movement between 6.4.2020 19:00 - 8.4.2020 17:00
# After conversation ~22:30

C_MONDAY = pd.Timestamp(year=2020, month=4, day=6)
C_TUESDAY = pd.Timestamp(year=2020, month=4, day=7)
C_WEDNESDAY = pd.Timestamp(year=2020, month=4, day=8)

MONITORING_STARTING_HOUR_BEG = 1
MONITORING_STARTING_HOUR_END = 4
MONITORING_ENDING_HOUR = 10
OUT_PATH = '/data/overflow/for_noya/ofek_ds/haredim_relative_mon_pass10.csv'

In [100]:
mon_od_df = get_od_df_of_date(C_MONDAY)
tue_od_df = get_od_df_of_date(C_TUESDAY)
wed_od_df = get_od_df_of_date(C_WEDNESDAY)
stops_df = pd.read_csv(STOPS_PATH, header=0, names=STOP_COLUMNS, parse_dates=['partition_day'])

In [101]:
all_relevant_OD = mon_od_df.loc[mon_od_df.start_hour > MONITORING_STARTING_HOUR_END]
all_relevant_OD = all_relevant_OD.append(mon_od_df)
all_relevant_OD = all_relevant_OD.append(wed_od_df.loc[wed_od_df.start_hour <= MONITORING_ENDING_HOUR])
all_relevant_OD = all_relevant_OD.groupby(['destination'])[['trips']].sum()
all_relevant_OD = fill_empty_visitors(all_relevant_OD)
all_relevant_OD.sort_values('destination', inplace=True)

In [102]:
# Get haredim
taz_ranks = get_TAZ_ranks()
haredim_OD = all_relevant_OD.mul(taz_ranks,axis=0)

In [103]:
# haredim/(haredim+all)
haredim_OD = haredim_OD.div(all_relevant_OD.add(haredim_OD))
haredim_OD.columns = ['visitors']

In [104]:
# Getting total number fo visitors per TAZ
visitors_est_end = get_visitors_number_specific_time(stops_df, C_WEDNESDAY, MONITORING_ENDING_HOUR)
visitors_est_end = fill_empty_visitors(visitors_est_end)

visitors_est_beg = get_visitors_number_specific_time(stops_df, C_TUESDAY, MONITORING_STARTING_HOUR_BEG)
visitors_est_beg = fill_empty_visitors(visitors_est_beg)
visitors_est_beg_end = get_visitors_number_specific_time(stops_df, C_TUESDAY, MONITORING_STARTING_HOUR_END)
visitors_est_beg_end = fill_empty_visitors(visitors_est_beg_end)

visitors_est = visitors_est_end - (visitors_est_beg + visitors_est_beg_end)/2

visitors_est.columns = ['visitors']
visitors_est.sort_values('stop_tazid', inplace=True)

In [105]:
visitors_est.describe()

Unnamed: 0,visitors
count,2630.0
mean,87.031749
std,171.2521
min,-1200.0
25%,0.0
50%,62.5
75%,158.875
max,2538.0


In [106]:
# Multiply to get final result
haredim_OD = haredim_OD.mul(visitors_est,axis=0)

In [107]:
# Export_to_file
haredim_OD.to_csv(OUT_PATH, index=None)

In [108]:
# Try reading
pd.read_csv((OUT_PATH)).describe()

Unnamed: 0,visitors
count,2482.0
mean,1.474114
std,14.628484
min,-93.25
25%,-0.0
50%,0.0
75%,0.0
max,230.0


### Making the files

In [82]:
visitors_subbed.index.values

array([     1,      2,      3, ..., 400997, 400998, 400999])

In [121]:
for path in glob.glob("{}/*".format(OD_FILES_FOLDER_PATH)):
    # UPDATED 4.7.2020 ~16:30 - Calculation of orthodox visitors BY TAZ ID
    
    # STEP 1: Read 2 ODs of requested weekend (input: date)
    requested_friday = parse_path(path)
    
    # For improving performance (a bit)
    if check_file_exists(requested_friday):
        print("skipped {}".format(requested_friday))
        continue

    
    requested_saturday = requested_friday + pd.Timedelta('1d')

    friday_od_df = get_od_df_of_date(requested_friday)
    saturday_od_df = get_od_df_of_date(requested_saturday)
    stops_df = pd.read_csv(STOPS_PATH, header=0, names=STOP_COLUMNS, parse_dates=['partition_day'])


    # STEP 2: Sum all trips of before shabbat
    friday_od_df = friday_od_df.loc[(friday_od_df.start_hour >= FRIDAY_REASONABLE_HOUR) & 
                                 (friday_od_df.start_hour <= SHABBAT_START_HOUR)]
    friday_od_df = friday_od_df.groupby(['destination','origin'])[['trips']].sum()


    # STEP 3: Multiply by orthodox rate and normalize
    friday_od_df = handle_multiindex_ans_df(friday_od_df)


    # STEP 4: Sub visitors pre summing from visitors pre shabbat
    visitors_subbed = estimate_visitors_number(stops_df)

    # STEP 5: Multiply step 3 by total visitors from step 4
    for index in friday_od_df.index.levels[0]: # first index (destination)
        friday_od_df.loc[friday_od_df.index.get_level_values('destination') == index] = friday_od_df.loc[
            friday_od_df.index.get_level_values('destination') == index] * visitors_subbed.loc[index]['count_imsi']

    # # STEP 6: Min step 5 with visitors during shabbat
    visitors_mid_shabbat = get_visitors_number_specific_time(stops_df, requested_saturday, SATURDAY_REASONABLE_HOUR)
    visitors_mid_shabbat = fill_empty_visitors(visitors_mid_shabbat)
    print(check_estimations(visitors_mid_shabbat, visitors_subbed))

    export_csv(friday_od_df, requested_friday)

skipped 2020-02-20 00:00:00
skipped 2020-04-06 00:00:00
skipped 2020-03-08 00:00:00
skipped 2020-03-25 00:00:00
skipped 2020-03-31 00:00:00
skipped 2020-02-27 00:00:00
skipped 2020-03-05 00:00:00
skipped 2020-03-29 00:00:00
skipped 2020-02-08 00:00:00
skipped 2020-03-26 00:00:00
skipped 2020-03-04 00:00:00
skipped 2020-03-11 00:00:00
skipped 2020-02-28 00:00:00
skipped 2020-03-20 00:00:00
skipped 2020-02-03 00:00:00
skipped 2020-03-21 00:00:00
skipped 2020-02-23 00:00:00
skipped 2020-02-26 00:00:00
skipped 2020-04-05 00:00:00
skipped 2020-03-10 00:00:00
skipped 2020-02-22 00:00:00
skipped 2020-02-07 00:00:00
skipped 2020-02-04 00:00:00
skipped 2020-02-02 00:00:00
skipped 2020-03-01 00:00:00
skipped 2020-02-15 00:00:00
skipped 2020-03-27 00:00:00
skipped 2020-03-02 00:00:00
skipped 2020-02-18 00:00:00
skipped 2020-02-12 00:00:00
skipped 2020-02-11 00:00:00
skipped 2020-02-21 00:00:00
skipped 2020-02-01 00:00:00
skipped 2020-03-24 00:00:00
skipped 2020-02-16 00:00:00
skipped 2020-03-28 0

### Make Final DF

In [165]:
ans = pd.DataFrame(index=range(1,2631))
for path in glob.glob(OUT_PATH.format("2020*")):
#     if path == OUT_PATH.format("all"):
#         continue
    # read the file
    current_date_df = pd.read_csv(path)
    # grup by origin (sum)
    current_date_df = current_date_df.groupby('destination').sum()
    current_date_df.drop('origin',axis=1,inplace=True)
    # sort by TAZ
    current_date_df.sort_index(inplace=True)
    # fill missing values with zeros
    for taz in range(1,2631):
        if taz not in current_date_df.index.values:
            current_date_df.loc[taz,'trips'] = 0
    # append COLUMN to ans
    date_as_string = path[-12:-4] # Changed because csv instead of xlsx
    ans[date_as_string] = current_date_df.trips
#     ans = ans.append(current_date_df)

In [167]:
ans.to_csv(OUT_PATH.format("all"),index=False)
!ls
ans

Exploration.ipynb		haredim_visitors_20200322.xlsx
haredim_visitors_20200201.csv	haredim_visitors_20200323.csv
haredim_visitors_20200201.xlsx	haredim_visitors_20200323.xlsx
haredim_visitors_20200202.csv	haredim_visitors_20200324.csv
haredim_visitors_20200202.xlsx	haredim_visitors_20200324.xlsx
haredim_visitors_20200203.csv	haredim_visitors_20200325.csv
haredim_visitors_20200203.xlsx	haredim_visitors_20200325.xlsx
haredim_visitors_20200204.csv	haredim_visitors_20200326.csv
haredim_visitors_20200204.xlsx	haredim_visitors_20200326.xlsx
haredim_visitors_20200205.csv	haredim_visitors_20200327.csv
haredim_visitors_20200205.xlsx	haredim_visitors_20200327.xlsx
haredim_visitors_20200206.csv	haredim_visitors_20200328.csv
haredim_visitors_20200206.xlsx	haredim_visitors_20200328.xlsx
haredim_visitors_20200207.csv	haredim_visitors_20200329.csv
haredim_visitors_20200207.xlsx	haredim_visitors_20200329.xlsx
haredim_visitors_20200208.csv	haredim_visitors_20200330.csv
haredim_visitors_202

Unnamed: 0,20200407,20200224,20200406,20200219,20200310,20200402,20200319,20200326,20200204,20200318,...,20200324,20200228,20200212,20200220,20200205,20200321,20200308,20200229,20200304,20200202
1,0.0,53.5,280.0,280.0,515.5,250.0,248.5,214.0,261.0,240.0,...,256.0,233.0,274.0,222.0,179.0,235.5,165.0,387.0,126.0,229.0
2,0.0,772.0,603.5,819.5,575.5,571.0,530.5,209.5,875.0,603.0,...,588.5,765.5,827.5,823.5,800.0,620.5,888.0,389.0,819.5,849.0
3,0.0,132.0,119.5,115.5,258.5,81.0,145.5,12.0,192.0,145.5,...,85.5,325.0,173.0,126.0,152.0,140.5,220.0,38.5,147.5,0.0
4,0.0,672.0,0.0,721.0,688.5,586.0,0.0,303.5,667.5,0.0,...,0.0,0.0,581.5,603.0,633.0,0.0,648.0,447.5,629.0,718.5
5,0.0,0.0,89.5,0.0,0.0,104.5,98.0,38.0,68.0,16.5,...,31.5,108.5,0.0,25.5,0.0,0.0,76.5,2.0,10.0,0.0
6,0.0,44.5,0.0,0.0,143.0,0.0,0.0,0.0,121.5,51.0,...,87.5,136.5,53.5,85.5,59.5,0.0,28.0,15.0,27.5,27.5
7,0.0,141.0,0.0,55.0,257.0,162.5,0.0,0.0,241.5,209.5,...,151.5,196.5,0.0,75.0,149.5,0.0,321.0,143.0,231.0,280.0
8,0.0,237.0,170.5,301.5,438.5,0.0,261.0,47.0,0.0,0.0,...,282.0,325.0,359.0,280.0,376.5,0.0,340.0,295.0,363.5,359.5
9,0.0,655.0,611.5,729.5,860.5,520.0,507.0,124.5,625.0,556.0,...,612.0,415.0,708.5,734.0,637.0,0.0,729.5,0.0,1005.5,656.5
10,0.0,740.0,353.0,579.5,348.5,306.0,295.0,141.0,564.5,0.0,...,0.0,331.5,535.0,547.5,545.5,327.0,502.5,145.0,565.0,580.0


In [174]:
ans['20200401'].sum()
# ! read haredim_visitors_all.xlsx

928385.5

# Debug Stuff

In [156]:
path = (OUT_PATH.format("20200406"))
current_date_df = pd.read_csv(path)
# grup by origin (sum)
current_date_df = current_date_df.groupby('destination').sum()
current_date_df.drop('origin',axis=1,inplace=True)
# sort by TAZ
current_date_df.sort_index(inplace=True)
# fill missing values with zeros
for taz in range(1,2631):
    if taz not in current_date_df.index.values:
        current_date_df.loc[taz,'trips'] = 0
# append COLUMN to ans
date_as_string = path[-12:-4] # Changed because csv instead of xlsx
ans[date_as_string] = current_date_df.trips
#     ans = ans.append(current_date_df)

In [150]:
current_date_df.max()
ans[date_as_string] = current_date_df.trips
ans

Unnamed: 0,20200201,20200329,20200318,20200324,20200304,20200407,20200305,20200217,20200224,20200406,...,20200320,20200206,20200229,20200405,20200227,20200218,20200323,20200208,20200209,20200327
1,239.0,248.0,240.0,256.0,126.0,0.0,140.5,154.0,0.0,280.0,...,132.5,0.0,387.0,0.0,252.5,0.0,0.0,0.0,0.0,158.0
2,494.0,665.5,603.0,588.5,819.5,0.0,642.0,768.5,0.0,603.5,...,333.5,0.0,389.0,0.0,706.0,0.0,0.0,0.0,0.0,552.0
3,158.5,0.0,145.5,85.5,147.5,0.0,220.0,169.0,0.0,119.5,...,0.0,0.0,38.5,0.0,239.0,0.0,0.0,0.0,0.0,0.0
4,0.0,642.0,0.0,0.0,629.0,0.0,592.5,0.0,0.0,0.0,...,0.0,0.0,447.5,0.0,0.0,0.0,0.0,0.0,0.0,625.0
5,76.5,0.0,16.5,31.5,10.0,0.0,0.0,34.0,0.0,89.5,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,51.0,87.5,27.5,0.0,0.0,87.5,0.0,0.0,...,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,149.5,209.5,151.5,231.0,0.0,68.5,130.5,0.0,0.0,...,136.5,0.0,143.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,477.0,321.0,0.0,282.0,363.5,0.0,291.0,312.0,0.0,170.5,...,130.0,0.0,295.0,0.0,351.0,0.0,0.0,0.0,0.0,0.0
9,793.5,561.0,556.0,612.0,1005.5,0.0,825.5,763.5,0.0,611.5,...,0.0,0.0,0.0,0.0,749.0,0.0,0.0,0.0,0.0,0.0
10,536.5,295.0,0.0,0.0,565.0,0.0,580.0,518.0,0.0,353.0,...,100.5,0.0,145.0,0.0,623.0,0.0,0.0,0.0,0.0,152.0
