In [None]:
#step1: format data file to simulate data operation with publisher and consumer

import pandas as pd

# Load your dataset
file_path = "hourly_transportation_202002.csv"  # Replace with your dataset's path
data = pd.read_csv(file_path)
data = data.sample(n=100, random_state=43)
# Combine date and hour into a timestamp
data['timestamp'] = pd.to_datetime(data['transition_date']) + pd.to_timedelta(data['transition_hour'], unit='h')

# Sort data by timestamp for proper batching
data = data.sort_values(by='timestamp')

# Group data by line
lines = data['line'].unique()
data_by_line = {line: data[data['line'] == line] for line in lines}

# Display the first few rows of a specific line's data
for line, line_data in data_by_line.items():
    print(f"Line: {line}")
    print(line_data.head())
    break  # Preview one line

Line: YENIKAPI - HACIOSMAN
      Unnamed: 0 transition_date  transition_hour  transport_type_id  \
1417      145589      2020-02-01               14                  2   
3529     4714557      2020-02-15                5                  2   
3637     8449382      2020-02-26                9                  2   
4938     9253911      2020-02-28               14                  2   

     road_type                  line transfer_type  number_of_passage  \
1417     RAYLI  YENIKAPI - HACIOSMAN        Normal                 12   
3529     RAYLI  YENIKAPI - HACIOSMAN        Normal                 10   
3637     RAYLI  YENIKAPI - HACIOSMAN       Aktarma                350   
4938     RAYLI  YENIKAPI - HACIOSMAN        Normal                  1   

      number_of_passenger product_kind   transaction_type_desc   town  \
1417                   12     UCRETSIZ                Ucretsiz  FATIH   
3529                   10   INDIRIMLI1       Indirimli Abonman  SISLI   
3637                  350  

In [32]:
# step2: run this after organizing data, kafka producer function 

from kafka import KafkaProducer
import json
import pandas as pd
import time

# Callback functions
def on_success(record_metadata):
    print("Success")
    print(f"Message sent to {record_metadata.topic} partition {record_metadata.partition} offset {record_metadata.offset}")

def on_error(ex):
    print(f"Error sending message: {ex}")

# Initialize Kafka producer
producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda x: json.dumps(
        {k: (v.isoformat() if isinstance(v, pd.Timestamp) else v) for k, v in x.items()}
    ).encode('utf-8')
)

res=[]
# Publish data line by line with callbacks
for line, line_data in data_by_line.items():
    
    hourly_batches = [group for _, group in line_data.groupby(line_data['timestamp'].dt.floor('H'))]
    
    for batch in hourly_batches:
        messages = batch.to_dict(orient='records')
        for message in messages:
            message.pop("timestamp")
            # Send message with success and error callbacks
            producer.send(
                "publictransportstream", 
                key=line.encode('utf-8'), 
                value=message
            ).add_callback(on_success).add_errback(on_error)

            print(f"Line {line}: Sent message: {message}")
            res.append(message)
          # Simulate hourly data streaming


Line YENIKAPI - HACIOSMAN: Sent message: {'Unnamed: 0': 145589, 'transition_date': '2020-02-01', 'transition_hour': 14, 'transport_type_id': 2, 'road_type': 'RAYLI', 'line': 'YENIKAPI - HACIOSMAN', 'transfer_type': 'Normal', 'number_of_passage': 12, 'number_of_passenger': 12, 'product_kind': 'UCRETSIZ', 'transaction_type_desc': 'Ucretsiz', 'town': 'FATIH', 'line_name': 'M2', 'station_poi_desc_cd': 'VEZNECILER KUZEY'}
Line YENIKAPI - HACIOSMAN: Sent message: {'Unnamed: 0': 4714557, 'transition_date': '2020-02-15', 'transition_hour': 5, 'transport_type_id': 2, 'road_type': 'RAYLI', 'line': 'YENIKAPI - HACIOSMAN', 'transfer_type': 'Normal', 'number_of_passage': 10, 'number_of_passenger': 10, 'product_kind': 'INDIRIMLI1', 'transaction_type_desc': 'Indirimli Abonman', 'town': 'SISLI', 'line_name': 'M2', 'station_poi_desc_cd': 'ATATURK OTO SANAYI KUZEY'}
Line YENIKAPI - HACIOSMAN: Sent message: {'Unnamed: 0': 8449382, 'transition_date': '2020-02-26', 'transition_hour': 9, 'transport_type_id'

KeyboardInterrupt: 

In [30]:
# use after load data from hbase the df should be a dataframe containing hbase data, if traning data is 12 months and never will be trained again you should not choose lines like in line 8

import pandas as pd
df = pd.read_csv("hourly_transportation_202003.csv")

df['timestamp'] = pd.to_datetime(df['transition_date']) + pd.to_timedelta(df['transition_hour'], unit='h')

march_df = df[df.line.isin(['HALKALI - GEBZE', 'KABATAS-BAGCILAR', 'KADIKOY-KARTAL',
       'KIRAZLI-BASAKSEHIR/METROKENT', 'LEVENT-HISAR USTU',
       'TOPKAPI-HABIBLER', 'USKUDAR-CEKMEKOY', 'YENIKAPI - HACIOSMAN',
       'YENIKAPI - HAVALIMANI', 'TAKSIM - KABATAS', 'TUNEL',
       'EYUP - PIYERLOTI', 'MACKA - TASKISLA', 'ISPER A.S',
       'KABATAS-MAHMUTBEY'])]

b=march_df.groupby(["timestamp","line","station_poi_desc_cd"]).sum()["number_of_passenger"].reset_index()

#unnecessary b_new = b[ ~(b.timestamp == "2020-03-01 00:00:00")	]


Unnamed: 0,timestamp,line,station_poi_desc_cd,number_of_passenger,day_of_week,month,hour
0,2020-03-01 00:00:00,HALKALI - GEBZE,FLORYA,1,6,3,0
1,2020-03-01 00:00:00,HALKALI - GEBZE,FLORYA AQUA,6,6,3,0
2,2020-03-01 00:00:00,HALKALI - GEBZE,OSMANGAZI,1,6,3,0
3,2020-03-01 00:00:00,HALKALI - GEBZE,PENDIK,1,6,3,0
4,2020-03-01 00:00:00,HALKALI - GEBZE,YESILKOY,6,6,3,0
...,...,...,...,...,...,...,...
140267,2020-03-31 23:00:00,YENIKAPI - HAVALIMANI,UCYUZLU,14,1,3,23
140268,2020-03-31 23:00:00,YENIKAPI - HAVALIMANI,ULUBATLI,24,1,3,23
140269,2020-03-31 23:00:00,YENIKAPI - HAVALIMANI,YENIBOSNA,23,1,3,23
140270,2020-03-31 23:00:00,YENIKAPI - HAVALIMANI,YENIKAPI,114,1,3,23


In [29]:
# data to model input converter 

import numpy as np
# Step 1: Convert timestamp to datetime and extract day of the week and month
data=b
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['day_of_week'] = data['timestamp'].dt.dayofweek  # 0 = Monday, 6 = Sunday
data['month'] = data['timestamp'].dt.month
data['hour'] = data['timestamp'].dt.hour

# Step 2: Pivot the data to organize passenger numbers by station over time
pivoted_data = data.pivot_table(
    index='timestamp',
    columns=['line', 'station_poi_desc_cd'],  # Combine line and station for unique keys
    values='number_of_passenger'
).fillna(0)

# Flatten the multi-index columns
pivoted_data.columns = ['_'.join(col) for col in pivoted_data.columns]

# Step 3: Add metadata for each timestamp (day_of_week and month)
metadata = data.drop_duplicates(subset=['timestamp'])[['timestamp', 'day_of_week', 'month', "hour"]].set_index('timestamp')

# Merge pivoted data with metadata
merged_data = pivoted_data.merge(metadata, left_index=True, right_index=True)


# Step 4: Create input-output pairs for time-series prediction
# We will use a rolling window approach for the input (past N hours) and target (next hour)
def create_time_series_data(df, target_col, sequence_length):
    """
    Creates time-series data for LSTM model input and target.
    
    :param df: DataFrame with features and target
    :param target_col: Target column for prediction
    :param sequence_length: Length of the past sequence to use as input
    :return: X (input data), y (target data)
    """
    X, y = [], []
    for i in range(sequence_length, len(df) - 1):
        X.append(df.iloc[i-sequence_length:i].values)
        y.append(df.iloc[i + 1][target_col])
    return np.array(X), np.array(y)

# Define parameters
sequence_length = 24  # Use past 24 hours as input

# Prepare input (X) and target (y) for all stations
station_columns = pivoted_data.columns.tolist()
X, y = create_time_series_data(merged_data[station_columns + ['day_of_week', 'month',"hour"]], station_columns, sequence_length)

# Step 5: Split the data into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Display the shapes of the datasets
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((459, 24, 242),
 (98, 24, 242),
 (99, 24, 242),
 (459, 239),
 (98, 239),
 (99, 239))

In [31]:
# columns that in the previous month not in next march month if we use 6 months for training this is not necessary
import json

with open("hebele.json","r") as f:
    data = json.load(f)
    
    
set(data.values())  -set(merged_data.columns) 

new_columns=['KABATAS-MAHMUTBEY_CAGLAYAN BATI',
 'KABATAS-MAHMUTBEY_CAGLAYAN DOGU',
 'KABATAS-MAHMUTBEY_CIRCIR BATI',
 'KABATAS-MAHMUTBEY_KARADENIZ MAH. BATI',
 'KABATAS-MAHMUTBEY_KARADENIZ MAH. DOGU',
 'KABATAS-MAHMUTBEY_KAZIMKARABEKIR',
 'KABATAS-MAHMUTBEY_MECIDIYEKOY DOGU',
 'KABATAS-MAHMUTBEY_NURTEPE BATI',
 'KABATAS-MAHMUTBEY_NURTEPE DOGU',
 'KABATAS-MAHMUTBEY_TEKSTILKENT',
 'KABATAS-MAHMUTBEY_VEYSELKARANI DOGU',
 'KABATAS-MAHMUTBEY_YESILPINAR']

merged_data[new_columns]=np.zeros((681,12))
final_df = merged_data[list(data.values())]

final_df.to_csv("march_month_df.csv",index_label=False)

In [24]:
import pandas as pd

In [25]:
df = pd.read_csv("hourly_transportation_202002 (2).csv")

In [26]:
df[(df.road_type=="RAYLI")].shape

(2572709, 13)

In [27]:
filtered_df.line.unique()

NameError: name 'filtered_df' is not defined

In [28]:
filtered_df['timestamp'] = pd.to_datetime(filtered_df['transition_date']) + pd.to_timedelta(filtered_df['transition_hour'], unit='h')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['timestamp'] = pd.to_datetime(filtered_df['transition_date']) + pd.to_timedelta(filtered_df['transition_hour'], unit='h')


In [34]:
b=filtered_df.groupby(["timestamp","line","station_poi_desc_cd"]).sum()["number_of_passenger"].reset_index()

  b=filtered_df.groupby(["timestamp","line","station_poi_desc_cd"]).sum()["number_of_passenger"].reset_index()


In [35]:
b.to_csv("big_data_input.csv")