In [1]:
import pandas as pd
import numpy as np
import prophet as pt
import datetime as dt
import os
import matplotlib.pyplot as plt
from prophet.plot import add_changepoints_to_plot

Importing plotly failed. Interactive plots will not work.


In [None]:
# Base path for the data; it is an empty string here and is not referenced
# by the other cells visible in this notebook.
Data_path = ''

# Names of every entry in the current working directory. os.listdir() makes
# no promise about ordering, so anything that slices this list by position
# depends on how the OS happens to report the entries.
files = os.listdir('.')

# Column labels for the raw files, passed as `names=` when each file is read.
columns = [
    "grid_num",
    "time",
    "cell_id",
    "sms_in",
    "sms_out",
    "call_in",
    "call_out",
    "internet",
]

In [None]:
def convert(t):
    """
    Convert a Unix timestamp in milliseconds to a naive datetime in GMT.

    The conversion is done with an explicit UTC timezone, so the result is
    the same on every machine. Taking the host's local time and shifting it
    by a fixed hour would only give GMT on a host running at UTC+1 without
    daylight saving.

    Parameters
    ----------
    t : int or float
        Milliseconds since the Unix epoch.

    Returns
    -------
    datetime.datetime
        Timezone-naive datetime expressed in GMT/UTC.
    """
    # Build an aware UTC datetime, then strip tzinfo so callers keep receiving
    # naive datetimes.
    return dt.datetime.fromtimestamp(t / 1e3, tz=dt.timezone.utc).replace(tzinfo=None)

In [None]:
# Iterate over all files, reducing each one to a single "rate" value per
# (time, grid_num) pair, then combine the per-file results.
#
# NOTE(review): files[7:] skips the first seven entries returned by
# os.listdir(), whose order is not guaranteed -- presumably those were
# non-data files on the author's machine. Confirm, or select the data files
# by name instead.
traffic_columns = ["sms_in", "sms_out", "call_in", "call_out", "internet"]
daily_frames = []  # one aggregated frame per file; concatenated once below
total_rows = 0     # running row count for the progress message

for f in files[7:]:
    # day label: the five characters that precede the last four of the name
    day = f[-9:-4]
    # read the file (tab-separated, no header row)
    df = pd.read_csv(f, sep="\t", header=None, names=columns)
    # convert the time column to datetimes
    df["time"] = pd.to_datetime(df["time"].apply(convert))
    # Sum each traffic column within every (time, grid_num) group, then add
    # the five sums together. Only the aggregated "rate" column is kept.
    # `df` keeps this name because later cells read it after the loop.
    df = (
        df.groupby(["time", "grid_num"])[traffic_columns]
        .sum()
        .sum(axis=1)
        .to_frame("rate")
    )
    daily_frames.append(df)
    total_rows += len(df)
    # Check point
    print('===========> Day {} is done!'.format(day))
    print('===========> Full data rows: {} rows'.format(total_rows))

# Concatenate once: calling pd.concat inside the loop re-copies everything
# accumulated so far on every iteration. pd.concat raises on an empty list,
# so fall back to an empty frame when no file was processed.
full_data = pd.concat(daily_frames) if daily_frames else pd.DataFrame()

# save the data to a csv
full_data.to_csv('full_grid.csv')
print('===========> Full data size: {} Mb'.format(os.path.getsize('full_grid.csv')/10**6))

In [None]:
# =========== 4-D array: (time step, grid row, grid column, 1) ===========
# Layout used to fold the flat per-time-step vector of grid values into an
# image. The buffer and the reshape below must use the same
# (rows, cols) order, otherwise the per-step assignment cannot broadcast.
GRID_ROWS, GRID_COLS = 98, 117

# NOTE(review): `df` here is whatever frame the loading loop left bound last
# (the final file it processed), not `full_data`. Confirm that is intended,
# given the output file is named 'full_grid.npy'.
flat_index = df.reset_index()
time_steps = flat_index['time'].unique()
grids = flat_index['grid_num'].unique()

# Fail early with a clear message instead of a reshape error inside the loop.
if len(grids) != GRID_ROWS * GRID_COLS:
    raise ValueError(
        'expected {} grid cells ({} x {}), found {}'.format(
            GRID_ROWS * GRID_COLS, GRID_ROWS, GRID_COLS, len(grids)
        )
    )

# Each sample along the first axis is one snapshot: the value of every grid
# cell at that time step, arranged as GRID_ROWS x GRID_COLS with one channel.
data = np.zeros(dtype=np.float64, shape=(len(time_steps), GRID_ROWS, GRID_COLS, 1))

for i, step in enumerate(time_steps):
    # df.loc[step] selects every grid row recorded at this time step; reshape
    # raises if a step does not hold exactly GRID_ROWS * GRID_COLS values.
    data[i, :, :, :] = df.loc[step].to_numpy().reshape(GRID_ROWS, GRID_COLS, 1)

# save to npy file
np.save('full_grid.npy', data)