In [105]:
# Connection parameters
host = 'localhost'
user= 'postgres'
password='0000'
database='preprocess'  # the name of the batabase we are connecting
port = "5432"
# connection_string=f'postgresql://{user}:{password}@{host}:5432/{database}'

In [106]:
# !pip install psycopg2
# ! pip install psycopg2-binary

In [107]:
# prepare
import sqlalchemy
import psycopg2
import pandas as pd
import sqlite3
import pandas as pd
from io import StringIO
import csv
from psycopg2 import extras
import numpy as np

In [108]:
# Create a connection to the database
conn = psycopg2.connect(
    host=host,
    database=database,
    user=user,
    password=password,
    port=port
)

In [109]:
# Create a cursor object
cur = conn.cursor()

In [365]:
# Part 1: Build tables:  6 component-->6 tables (finish)

In [366]:
# 1) input itinerary component
cur.execute("""
DROP TABLE itinerary_input CASCADE
""")

In [367]:
# 1) input itinerary component
cur.execute("""
CREATE TABLE "itinerary_input" (
  "id" SERIAL PRIMARY KEY,
  "Orig_s" text NOT NULL,   
  "Dest_s" text NOT NULL,
  "depDay" varchar(10) NOT NULL
)
""")

In [368]:
# Commit the transaction for the CREATE TABLE command
conn.commit()

In [369]:
# 2) market share (pie graph) component 
# consistent with itinerary, map, and recommendation
cur.execute("""
DROP TABLE marketshare CASCADE
""")

In [370]:
cur.execute("""
CREATE TABLE "marketshare" (
  "id" SERIAL PRIMARY KEY,
  "itinerary_input_id" INTEGER NOT NULL,
  "market_share" FLOAT NOT NULL,
  FOREIGN KEY ("itinerary_input_id") REFERENCES "itinerary_input" ("id")
)
""")


In [371]:
# Commit the transaction for the CREATE TABLE command
conn.commit()

In [372]:
# 3) recommendation table component
# consistent with market share, map, and itinerary
cur.execute("""
DROP TABLE recommendation CASCADE
""")

In [373]:
cur.execute("""
CREATE TABLE "recommendation" (
  "id" SERIAL PRIMARY KEY,
  "itinerary_input_id" INTEGER NOT NULL,
  "dep_hour" INTEGER NOT NULL,
  "dep_min" INTEGER NOT NULL,
  "arr_hour" INTEGER NOT NULL,
  "arr_min" INTEGER NOT NULL,
  "option" TEXT,
  "elapstime" FLOAT NOT NULL,
  FOREIGN KEY ("itinerary_input_id") REFERENCES "itinerary_input" ("id") 
)
""")

In [374]:
# Commit the transaction for the CREATE TABLE command
conn.commit()

In [375]:
# 4) map component
# consistent with market share, itinerary and recommendation
cur.execute("""
DROP TABLE map CASCADE
""")

In [376]:
cur.execute("""
CREATE TABLE "map" (
  "id" SERIAL PRIMARY KEY,
  "itinerary_input_id" INTEGER NOT NULL,
  "Orig_s" text NOT NULL,
  "Dest_s" text NOT NULL,
  FOREIGN KEY ("itinerary_input_id") REFERENCES "itinerary_input" ("id")
)
""")

In [377]:
# Commit the transaction for the CREATE TABLE command
conn.commit()

In [39]:
# 5) result component (highest market_share itinerary info)
cur.execute("""
DROP TABLE result CASCADE
""")

In [40]:
cur.execute("""
CREATE TABLE "result" (
  "id" SERIAL PRIMARY KEY,
  "marketshare_id" INTEGER NOT NULL,
  "TOT_pax" FLOAT NOT NULL,
  "accuracy" FLOAT,
  FOREIGN KEY ("marketshare_id") REFERENCES "marketshare" ("id")
)
""")

In [41]:
# Commit the transaction for the CREATE TABLE command
conn.commit()

In [81]:
cur.execute("""
DROP TABLE other_info CASCADE
""")

In [82]:
# 6) other info component (highest market_share itinerary info)
cur.execute("""
CREATE TABLE "other_info" (
  "id" SERIAL PRIMARY KEY,
  "marketshare_id" INTEGER NOT NULL,
  "detour" FLOAT NOT NULL,
  "stops" INTEGER NOT NULL,
  "real_dist" FLOAT NOT NULL,
  FOREIGN KEY ("marketshare_id") REFERENCES "marketshare" ("id")
)
""")

In [83]:
# Commit the transaction for the CREATE TABLE command
conn.commit()

In [378]:
# Part 2: import preprocessing (Prepro_v2) dataset to tables (current)
# data: preprocessing data

In [379]:
# 1) input itinerary component
preprocessed_data = pd.read_csv('C:/Users/Alla/Desktop/苏黎世/第三学期/product/backend/data/dataprep_v2.csv')

In [380]:
# Select only the relevant columns
df_selected = preprocessed_data[['Orig_s', 'Dest_s', 'depDay']]
# Convert 'depDay' from float to string if it's not in the datetime format
df_selected['depDay'] = df_selected['depDay'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['depDay'] = df_selected['depDay'].astype(str)


In [381]:
# Convert the DataFrame to an in-memory buffer for copying to SQL
buffer = StringIO()
df_selected.to_csv(buffer, index=False, header=False)
buffer.seek(0)

0

In [382]:
# Use copy_from to copy the data to the database
cur.copy_from(buffer, 'itinerary_input', sep=',', columns=('Orig_s', 'Dest_s', 'depDay'))
conn.commit()

In [383]:
# 2) market share (pie graph) component
# 1) input itinerary component
preprocessed_data = pd.read_csv('C:/Users/Alla/Desktop/苏黎世/第三学期/product/backend/data/dataprep_v2.csv')
df_selected = preprocessed_data[['market_share']].astype(float)

In [384]:
# Retrieve all ids from the itinerary_input table
cur.execute("SELECT id FROM itinerary_input ORDER BY id")
itinerary_input_ids = cur.fetchall()

In [385]:
# Check if the number of rows matches
if len(df_selected) <= len(itinerary_input_ids):
    # Assign the itinerary_input_id values to the market_share DataFrame
    df_selected['itinerary_input_id'] = [id[0] for id in itinerary_input_ids[:len(df_selected)]]
else:
    raise ValueError("There are more market_share entries than itinerary_input_ids available.")

In [386]:
# Prepare the data for insertion
# Assuming the market_share_data DataFrame now has two columns: 'market_share' and 'itinerary_input_id'
# Convert the DataFrame to a list of tuples
data_tuples = list(df_selected.itertuples(index=False, name=None))

In [387]:
# Insert the data into the marketshare table
insert_query = 'INSERT INTO marketshare (market_share, itinerary_input_id) VALUES %s'
extras.execute_values(cur, insert_query, data_tuples)

In [388]:
conn.commit()

In [389]:
# 3) recommendation table component
# 1) input itinerary component
preprocessed_data = pd.read_csv('C:/Users/Alla/Desktop/苏黎世/第三学期/product/backend/data/dataprep_v2.csv')
# Select only the relevant columns
df_selected = preprocessed_data[['dep_hour', 'dep_min', 'arr_hour', 'arr_min', 'elaptime']]
# define the data type 
df_selected['dep_hour'] = df_selected['dep_hour'].astype(int)
df_selected['dep_min'] = df_selected['dep_min'].astype(int)
df_selected['arr_hour'] = df_selected['arr_hour'].astype(int)
df_selected['arr_min'] = df_selected['arr_min'].astype(int)
df_selected['elaptime'] = df_selected['elaptime'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['dep_hour'] = df_selected['dep_hour'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['dep_min'] = df_selected['dep_min'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['arr_hour'] = df_selected['arr_hour'].astype(int)
A value is trying to 

In [390]:
# Retrieve all ids from the itinerary_input table
cur.execute("SELECT id FROM itinerary_input ORDER BY id")
itinerary_input_ids = cur.fetchall()

In [391]:
# Check if the number of rows matches
if len(df_selected) <= len(itinerary_input_ids):
    # Assign the itinerary_input_id values to the df_selected DataFrame
    df_selected['itinerary_input_id'] = [id_tuple[0] for id_tuple in itinerary_input_ids[:len(df_selected)]]
    df_selected['option'] = None  # Assuming 'option' is a placeholder for actual options to be filled in later.

    # Reorder the DataFrame columns to match the table schema
    df_selected = df_selected[['itinerary_input_id', 'dep_hour', 'dep_min', 'arr_hour', 'arr_min', 'option', 'elaptime']]

    # Convert the DataFrame to an in-memory buffer for copying to SQL
    buffer = StringIO()
    df_selected.to_csv(buffer, index=False, header=False)
    buffer.seek(0)

    # Use copy_from to copy the data to the database
    cur.copy_from(buffer, 'recommendation', sep=',', columns=('itinerary_input_id', 'dep_hour', 'dep_min', 'arr_hour', 'arr_min', 'option', 'elapstime'))
    conn.commit()
else:
    raise ValueError("There are more elaptime entries than itinerary_input_ids available.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['itinerary_input_id'] = [id_tuple[0] for id_tuple in itinerary_input_ids[:len(df_selected)]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['option'] = None  # Assuming 'option' is a placeholder for actual options to be filled in later.


In [392]:
# Note: Reorder can ensure the mismatch

In [407]:
# 4) map component
# 1) input itinerary component
preprocessed_data = pd.read_csv('C:/Users/Alla/Desktop/苏黎世/第三学期/product/backend/data/dataprep_v2.csv')
# Select only the relevant columns
df_selected = preprocessed_data[['Orig_s', 'Dest_s']]

In [408]:
# Retrieve all ids from the itinerary_input table
cur.execute("SELECT id FROM itinerary_input ORDER BY id")
itinerary_input_ids = cur.fetchall()

In [409]:
# based on Orig_s to set foreign key (itinerary_input_id)
# Check if the number of rows matches
if len(df_selected) <= len(itinerary_input_ids):
    # Assign the itinerary_input_id values to the df_selected DataFrame
    df_selected['itinerary_input_id'] = [id_tuple[0] for id_tuple in itinerary_input_ids[:len(df_selected)]]

    # Reorder the DataFrame columns to match the table schema
    df_selected = df_selected[['itinerary_input_id', 'Orig_s', 'Dest_s']]
    df_selected['opinion'] = None

    # Convert the DataFrame to an in-memory buffer for copying to SQL
    buffer = StringIO()
    df_selected.to_csv(buffer, index=False, header=False)
    buffer.seek(0)

    # Use copy_from to copy the data to the database
    cur.copy_from(buffer, 'map', sep=',', columns=('itinerary_input_id', 'Orig_s', 'Dest_s'))
    conn.commit()
else:
    raise ValueError("There are more entries than itinerary_input_ids available.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['itinerary_input_id'] = [id_tuple[0] for id_tuple in itinerary_input_ids[:len(df_selected)]]


In [66]:
# 5) result component (highest market_share itinerary info)
preprocessed_data = pd.read_csv('C:/Users/Alla/Desktop/苏黎世/第三学期/product/backend/data/dataprep_v2.csv')
# Select only the relevant columns
df_selected = preprocessed_data[['TOT_pax']]

In [67]:
# Retrieve all ids from the recommendation table
cur.execute("SELECT id FROM marketshare ORDER BY id")
marketshare_ids = cur.fetchall()

In [68]:
if len(df_selected) <= len(marketshare_ids):
    # Assign the itinerary_input_id values to the df_selected DataFrame
    df_selected['marketshare_id'] = [id_tuple[0] for id_tuple in marketshare_ids[:len(df_selected)]]

    # Reorder the DataFrame columns to match the table schema
    df_selected = df_selected[['marketshare_id', 'TOT_pax']]
    # Make sure to assign np.nan instead of None to create proper NaN values for float columns
    df_selected['accuracy'] = np.nan

    # Convert the DataFrame to an in-memory buffer for copying to SQL
    buffer = StringIO()
    df_selected.to_csv(buffer, index=False, header=False, na_rep='\\N') 
    # Use na_rep to replace NaN with \N in the CSV
    buffer.seek(0)

    # Use copy_from to copy the data to the database
    cur.copy_from(buffer, 'result', sep=',', columns=('marketshare_id', 'TOT_pax', 'accuracy'))
    conn.commit()
else:
    raise ValueError("There are more entries than fk_constrain available.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['marketshare_id'] = [id_tuple[0] for id_tuple in marketshare_ids[:len(df_selected)]]


In [110]:
# 6) other info component (highest market_share itinerary info)
# 1) input itinerary component
preprocessed_data = pd.read_csv('C:/Users/Alla/Desktop/苏黎世/第三学期/product/backend/data/dataprep_v2.csv')

In [111]:
# Create a function to determine the number of stops based on the columns
def calculate_stops(row):
    # Check each 'stops_X.X' column starting from the highest
    for i in range(2, -1, -1):
        if row[f'stops_{i}.0'] == 1:
            return i
    return 0  # Return 0 if none of the stops columns have a 1

In [112]:
# Apply the function to each row to create the 'stops' column
preprocessed_data['stops'] = preprocessed_data.apply(calculate_stops, axis=1)

In [113]:
# Select only the relevant columns
df_selected = preprocessed_data[['detour', 'stops', 'real_dist']]
# define the data type 
df_selected['detour'] = df_selected['detour'].astype(float)
df_selected['stops'] = df_selected['stops'].astype(int)
df_selected['real_dist'] = df_selected['real_dist'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['detour'] = df_selected['detour'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['stops'] = df_selected['stops'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['real_dist'] = df_selected['real_dist'].astype(float)


In [114]:
# Retrieve all ids from the recommendation table
cur.execute("SELECT id FROM marketshare ORDER BY id")
marketshare_ids = cur.fetchall()

In [115]:
if len(df_selected) <= len(marketshare_ids):
    # Assign the itinerary_input_id values to the df_selected DataFrame
    df_selected['marketshare_id'] = [id_tuple[0] for id_tuple in marketshare_ids[:len(df_selected)]]

    # Reorder the DataFrame columns to match the table schema
    df_selected = df_selected[['marketshare_id', 'detour', 'stops', 'real_dist']]

    # Convert the DataFrame to an in-memory buffer for copying to SQL
    buffer = StringIO()
    df_selected.to_csv(buffer, index=False, header=False) 
    buffer.seek(0)

    # Use copy_from to copy the data to the database
    cur.copy_from(buffer, 'other_info', sep=',', columns=('marketshare_id', 'detour', 'stops', 'real_dist'))
    conn.commit()
else:
    raise ValueError("There are more entries than fk_constrain available.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['marketshare_id'] = [id_tuple[0] for id_tuple in marketshare_ids[:len(df_selected)]]


In [116]:
# Close the cursor and connection
cur.close()
conn.close()