In [None]:
import pandas as pd
import glob
#loading test weather data
# List all the weather data files
file_pattern = 'test_set\weather_data\weather_data_*'
files = glob.glob(file_pattern)

# Read the files and store them in a list of DataFrames
dataframes = []
for file in files:
    df = pd.read_csv(file, sep='\t', header=None, names=['timestamp', 'weather', 'temperature', 'pollution'])
    dataframes.append(df)

# Combine all the DataFrames into a single DataFrame
test_weather_data=None
if dataframes:
    test_weather_data = pd.concat(dataframes)
else:
    exit('No data files found.')

# Convert the 'timestamp' column to a datetime object
test_weather_data['timestamp'] = pd.to_datetime(test_weather_data['timestamp'])

# Reset the index and sort the DataFrame by the timestamp
test_weather_data = test_weather_data.sort_values(by='timestamp').reset_index(drop=True)

print(test_weather_data.head())




In [None]:
# loading the training weather data
file_pattern = 'training_data\\weather_data\\weather_data_*'
files = glob.glob(file_pattern)

# Read the files and store them in a list of DataFrames
dataframes = []
for file in files:
    df = pd.read_csv(file, sep='\t', header=None, names=['timestamp', 'weather', 'temperature', 'pollution'])
    dataframes.append(df)

# Combine all the DataFrames into a single DataFrame
train_weather_data=None
if dataframes:
    train_weather_data = pd.concat(dataframes)
else:
    exit('No data files found.')

# Convert the 'timestamp' column to a datetime object
train_weather_data['timestamp'] = pd.to_datetime(train_weather_data['timestamp'])

# Reset the index and sort the DataFrame by the timestamp
train_weather_data = train_weather_data.sort_values(by='timestamp').reset_index(drop=True)

print(train_weather_data.head())



In [None]:
import pandas as pd
import glob
# loading the test order data
# List all the order data files
file_pattern = 'test_set/order_data/test_order_data_*'
files = glob.glob(file_pattern)

# Read the files and store them in a list of DataFrames
dataframes = []
for file in files:
    df = pd.read_csv(file, header=None, names=['order_id', 'driver_id', 'passenger_id', 'start_region_hash', 'Time'])
    dataframes.append(df)

# Combine all the DataFrames into a single DataFrame
test_order_data = None
if dataframes:
    test_order_data = pd.concat(dataframes)
else:
    exit('No data files found.')

# Convert the 'Time' column to a datetime object
test_order_data['Time'] = pd.to_datetime(test_order_data['Time'])

# Reset the index and sort the DataFrame by the timestamp
test_order_data = test_order_data.sort_values(by='Time').reset_index(drop=True)

print(test_order_data.head())


In [None]:
#loading the training order data
# read training data
# List all the training data files
file_pattern = 'training_data\\order_data\\order_data_*'
files = glob.glob(file_pattern)

# Read the files and store them in a list of DataFrames
dataframes = []
for file in files:
    df = pd.read_csv(file, header=None, sep='\t', names=['order_id', 'driver_id', 'passenger_id', 'start_region_hash', 'dest_region_hash', 'Price', 'Time'])
    dataframes.append(df)

# Combine all the DataFrames into a single DataFrame
training_order_data = None
if dataframes:
    training_order_data = pd.concat(dataframes)
else:
    exit('No data files found.')

# Convert the 'timestamp' column to a datetime object
training_order_data['Time'] = pd.to_datetime(training_order_data['Time'])

# Reset the index and sort the DataFrame by the timestamp
training_order_data = training_order_data.sort_values(by='Time').reset_index(drop=True)

print(training_order_data.head())


In [None]:
# load test cluster map
test_cluster_map = pd.read_csv('test_set\\cluster_map\\cluster_map', sep='\t', header=None, names=['region_hash', 'cluster_id'])
print(test_cluster_map.head())

In [None]:
#load training cluster map
train_cluster_map = pd.read_csv('training_data\\cluster_map\\cluster_map', sep='\t', header=None, names=['region_hash', 'cluster_id'])
print(train_cluster_map.head())


In [None]:
#load test poi data
def read_file(filename):
    region_data = {}
    with open(filename, 'r') as file:
        for line in file:
            items = line.strip().split('\t')
            region_hash = items[0]
            values = items[1:]
            region_data[region_hash] = values
    return region_data

test_poi_data = read_file('test_set\\poi_data\\poi_data')
print(test_poi_data)

In [None]:
#load training poi data
train_poi_data = read_file('training_data\\poi_data\\poi_data')
print(train_poi_data)



In [None]:

# #-------------------Dummy data

# orderdummy= pd.read_csv('D:\Python-Project\AI\dummyorder.txt', delimiter='\t')
# weatherdummy= pd.read_csv('D:\Python-Project\AI\dummyweather.txt', delimiter='\t')

# merged_df = pd.merge(weatherdummy, orderdummy, left_on='timestamp', right_on='Time',how='outer')
# merged_df = merged_df.drop(columns=['timestamp'])


In [None]:
def split_category(value):
  num = value.split(':')[0]
  #-------Storing that value in our set
    #print(num)
  if('#' in num):
      val=num.split('#')                  #-------getting the sub category only instead of main
      val=(val[1])
  else:
      val=num
  
  return val

def split_count(value):
  num = value.split(':')[1]
  return num

In [None]:
merged_df=training_order_data.copy(deep=True)
region_count={}
#-----------------------counting the total facilities in each region
for key, values in train_poi_data.items():      
        count=0
#------Looping over the list of facilities for a specific region
        for value in values:
        #-------splitting on semi colon to geth the facility type instead of the number of facility
                val=split_count(value)
                count=int(val)+count
        #print("Region: count: ",key,count)
        region_count.update({key: count})

merged_df['Facilities'] = merged_df['start_region_hash'].map(region_count)
print(merged_df)
           


In [None]:

#---------merging the merged data with cluster map to replace the region hash value with its cluster ID value
final = pd.merge(merged_df, train_cluster_map, left_on='start_region_hash', right_on='region_hash')
final = final.drop(columns=['region_hash','start_region_hash'])
final = final.rename(columns={'cluster_id': 'start_region_id'})

final = pd.merge(final, train_cluster_map, left_on='dest_region_hash', right_on='region_hash')
final = final.drop(columns=['region_hash','dest_region_hash'])
final = final.rename(columns={'cluster_id': 'dest_region_id'})


#---------extracting day of the week from the given time stamp
final['Time']=pd.to_datetime(final['Time'])
final['day']=(final['Time'].dt.day_name())

# print("column count: ",final.shape[1])
# print("row count",final.count())
#print(final.head())

#-------------Sorting the table first according to region then within those regions, sorting according to time 
final = final.sort_values(by=['start_region_id', 'Time'])



In [168]:
#----------------------Finding the Demand-supply gap for each ten minutes
#print(sorted_final)

#Group by Region and time in 10 minute intervals
training = final.groupby(['start_region_id', pd.Grouper(key='Time', freq='10min')]).agg({'order_id': 'count', 'driver_id': lambda x: x.notnull().sum(),'Facilities':'mean'}).reset_index()

#calculating the demand-gap by finding difference between total orders and given drivers
training['Gap'] = training.apply(lambda row: (row['order_id'] - row['driver_id']), axis=1)
print(training)

#------counting how many rows of each region
counts = training['start_region_id'].value_counts()
counts_df = counts.reset_index().rename(columns={'index': 'region'})
counts_df=counts_df.sort_values(by='region',ascending=True)
print(counts_df)





        start_region_id                Time  order_id  driver_id  Facilities  \
0                     1 2016-01-01 00:00:00       161        153    653376.0   
1                     1 2016-01-01 00:10:00       173        166    653376.0   
2                     1 2016-01-01 00:20:00       153        146    653376.0   
3                     1 2016-01-01 00:30:00       137        133    653376.0   
4                     1 2016-01-01 00:40:00       125        124    653376.0   
...                 ...                 ...       ...        ...         ...   
157166               66 2016-01-21 23:10:00         2          2    138942.0   
157167               66 2016-01-21 23:20:00         1          1    138942.0   
157168               66 2016-01-21 23:30:00         2          2    138942.0   
157169               66 2016-01-21 23:40:00         4          3    138942.0   
157170               66 2016-01-21 23:50:00         3          3    138942.0   

        Gap  
0         8  
1         7

In [169]:
#Grouping weather data into 10 min slots as well

weather = train_weather_data.groupby(['weather', pd.Grouper(key='timestamp', freq='10min')]).agg({'pollution': 'mean','temperature':'mean',}).reset_index()
print(weather)

      weather           timestamp  pollution  temperature
0           1 2016-01-01 00:00:00      177.0          3.5
1           1 2016-01-01 00:10:00      177.0          3.0
2           1 2016-01-01 00:20:00      177.0          3.0
3           1 2016-01-01 00:30:00      177.0          3.0
4           1 2016-01-01 00:40:00      177.0          3.0
...       ...                 ...        ...          ...
2740        9 2016-01-18 14:50:00      276.0          6.0
2741        9 2016-01-18 15:00:00      276.0          6.0
2742        9 2016-01-18 15:20:00      276.0          5.0
2743        9 2016-01-18 15:30:00      276.0          6.0
2744        9 2016-01-18 15:40:00      263.0          5.0

[2745 rows x 4 columns]


In [174]:
import pandas as pd


#---------merging weather data and order data on time
merged_df = pd.merge(weather, training, left_on='timestamp', right_on='Time',how='right')
merged_df = merged_df.drop(columns=['timestamp'])

print(merged_df)

        weather  pollution  temperature  start_region_id                Time  \
0           1.0      177.0          3.5                1 2016-01-01 00:00:00   
1           1.0      177.0          3.0                1 2016-01-01 00:10:00   
2           1.0      177.0          3.0                1 2016-01-01 00:20:00   
3           1.0      177.0          3.0                1 2016-01-01 00:30:00   
4           1.0      177.0          3.0                1 2016-01-01 00:40:00   
...         ...        ...          ...              ...                 ...   
161145      2.0       58.0          1.0               66 2016-01-21 23:10:00   
161146      2.0       58.0          1.0               66 2016-01-21 23:20:00   
161147      2.0       58.0          1.0               66 2016-01-21 23:30:00   
161148      2.0       59.0          1.0               66 2016-01-21 23:40:00   
161149      2.0       59.0          1.0               66 2016-01-21 23:50:00   

        order_id  driver_id  Facilities

In [None]:
#-----------------------------------------------------------ignore

# df_sorted = final.sort_values(by=['start_region_id'], ascending= True)

# region_count=[0]*66
# for i in range(66):                 #-------storing each regions number of rows in the corresponding index, index 0 = region 1
#     region_count[i] = final[final['start_region_id'] == i].shape[0]
#     #region_count[i] = final['start_region_id'].value_counts()[i]

# print(final)

# i=0
# for k in region_count:
#     print("For region: ",i)
#     print("number of rows: ",k)
#     for j in range(k):
#         print("indexing inside: ",j)
#     i+=1

# df_sorted.to_csv('mydata.csv', index=False)
# print(df_sorted)

In [None]:
#------Extracting the total unique facilities provided over all from given poin

# import pandas as pd
# #-------Creating a set as it stores unique values
# facilities = set()
# #-------Looping over our poid data dictionary
# for key, values in train_poi_data.items():
#         #------Looping over the list of facilities for a specific region
#     for value in values:
#                 #-------splitting on semi colon to geth the facility type instead of the number of facility
#         val=split_category(value)
#         facilities.add(int(val))        #converting to int and storing so easier to sort

# sorted_values = sorted(list(facilities))

# for col in sorted_values:
#         merged_df[col] = 0

# #print(sorted_values)
# #print(len(sorted_values))
# #print(merged_df)

# def check_facilities(vals, df, idx):
#     for f in vals:
#         value = split_category(f)
#         if int(value) in df.columns:
#             df.loc[idx, str(value)] = 1
# # ---------for each region that has the facility replace the 0 with 1
# # ----------------------to do, iterate over the data frame and for the regions of order put 1 
# #for the facilities in that region(either start or destination)
# for key, values in train_poi_data.items():
#   # print(key,values)
#   for index,row in merged_df.iterrows():
#     if(key == row['start_region_hash']):
#         check_facilities(values,merged_df,index)


# print(merged_df)

In [None]:
#checking if correcting merged and the hash values are correctly changed to cluster ID

#---------example row
# 9a864e958859b506f5f8bee9d8dfff17	orderID
# a323121d71cd5247f38a4848c2039cb1	driverID
# b9bd961ee676441d64c8748aa18efcda	passengerid
# b05379ac3f9b7d99370d443cfd5dcc28	startregion
# 52d7b69796362a8ed1691a6cc02ddde4   	destregion
# 45.0					price
# 2016-01-01 00:00:03			timestamp

# result = train_cluster_map.loc[train_cluster_map['region_hash'] == '52d7b69796362a8ed1691a6cc02ddde4']
# print(result) 
# result = final.loc[final['order_id'] == '9a864e958859b506f5f8bee9d8dfff17']
# print(result)




In [None]:
import pandas as pd

# create a sample dataframe with datetime column and values column
df = pd.DataFrame({'datetime': pd.date_range('2022-01-01 00:00:00', '2022-01-01 01:00:00', freq='1min'), 'values': range(61)})
# set the datetime column as index
print(df)
df = df.set_index('datetime')

# resample to 10-minute intervals and fill missing values with interpolation
df_10min = df.resample('10T').interpolate()

# print the result
print(df_10min)
