In [2]:
import os
import warnings

import numpy as np
import pandas as pd
import psycopg2
from psycopg2 import extras
from sqlalchemy import create_engine
from sqlalchemy.sql import text

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so any pandas/SQLAlchemy warnings raised by
# later cells are hidden too — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [3]:
# The hw1 database must be created beforehand.
# The connection URL is taken from the HW1_DATABASE_URL environment variable so that
# real credentials never have to be written into the notebook; the fallback is the
# local development default this notebook originally used.
postgres_str = os.environ.get(
    "HW1_DATABASE_URL",
    "postgresql://postgres:geheim@localhost:5432/hw1",
)
cnx = create_engine(postgres_str)

In [4]:
# Tables are listed in dependency order: a table appears after every table its
# foreign keys point to (address -> customer, transaction -> product and customer).
table_names = ["customer", "address", "product", "transaction"]
create_customer_tbl_q = """CREATE TABLE customer (
  id serial primary key,
  first_name varchar(20) not null,
  last_name varchar(20),
  gender varchar(6) not null,
  dob date,
  job_title text,
  job_industry_category text,
  wealth_segment text not null,
  deceased_indicator char(1) not null,
  owns_car varchar(3) not null
)
"""
# The foreign key must name the table actually created above ("customer", singular).
create_address_tbl_q = """CREATE TABLE address (
  id serial primary key,
  address text not null,
  postcode char(4) not null,
  state varchar(20) not null,
  country varchar(20) not null,
  property_valuation smallint not null,
  customer_id int references customer(id)
)
"""
create_product_tbl_q = """CREATE TABLE product (
  id serial primary key,
  legacy_product_id int not null,
  brand varchar(15) not null,
  line varchar(10) not null,
  class varchar(10) not null,
  size varchar(10) not null,
  list_price money not null,
  standard_cost money not null
)
"""
# Both foreign keys reference the singular table names created by this cell.
create_transaction_tbl_q = """CREATE TABLE transaction (
  id serial primary key,
  date date,
  online_order bool,
  order_status varchar(10),
  product_id int references product(id),
  customer_id int references customer(id)
)
"""
create_queries = [create_customer_tbl_q, create_address_tbl_q, create_product_tbl_q, create_transaction_tbl_q]

# engine.begin() commits when the block exits normally and rolls back on error,
# so a failed CREATE never leaves the schema half-built.
with cnx.begin() as conn:
    # Drop in reverse dependency order: referencing tables must go before the
    # tables they reference, otherwise PostgreSQL refuses the DROP.
    for tbl_name in reversed(table_names):
        conn.execute(text(f"drop table if exists {tbl_name}"))
    # Create in dependency order so every REFERENCES target already exists.
    for create_q in create_queries:
        conn.execute(text(create_q))

In [5]:
# sheet_name=None makes read_excel load every sheet and return a dict keyed by sheet name.
df = pd.read_excel("../data/customer_and_transaction__2_.xlsx", sheet_name=None)

In [6]:
# Split the 'customer' sheet into a customer-attributes frame and an address frame.
customer_full_df = df['customer']

customer_columns = [
    'customer_id', 'first_name', 'last_name', 'gender', 'DOB', 'job_title',
    'job_industry_category', 'wealth_segment', 'deceased_indicator', 'owns_car',
]
address_columns = ['address', 'postcode', 'state', 'country', 'property_valuation', 'customer_id']

cust_df = customer_full_df[customer_columns]
addr_df = customer_full_df[address_columns]
addr_df

Unnamed: 0,address,postcode,state,country,property_valuation,customer_id
0,060 Morning Avenue,2016,New South Wales,Australia,10,1
1,6 Meadow Vale Court,2153,New South Wales,Australia,10,2
2,0 Holy Cross Court,4211,QLD,Australia,9,3
3,17979 Del Mar Point,2448,New South Wales,Australia,4,4
4,9 Oakridge Court,3216,VIC,Australia,9,5
...,...,...,...,...,...,...
3995,57042 Village Green Point,4511,QLD,Australia,6,3996
3996,87 Crescent Oaks Alley,2756,NSW,Australia,10,3997
3997,8194 Lien Street,4032,QLD,Australia,7,3998
3998,320 Acker Drive,2251,NSW,Australia,7,3999


In [7]:
# Align the column names with the customer table schema (customer_id -> id, DOB -> dob).
# Reassign instead of renaming in place: cust_df was sliced out of customer_full_df, and
# in-place mutation of such a slice triggers pandas' SettingWithCopyWarning.
cust_df = cust_df.rename(columns={'customer_id': 'id', 'DOB': 'dob'})
cust_df

Unnamed: 0,id,first_name,last_name,gender,dob,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car
0,1,Laraine,Medendorp,F,1953-10-12 00:00:00,Executive Secretary,Health,Mass Customer,N,Yes
1,2,Eli,Bockman,Male,1980-12-16 00:00:00,Administrative Officer,Financial Services,Mass Customer,N,Yes
2,3,Arlin,Dearle,Male,1954-01-20 00:00:00,Recruiting Manager,Property,Mass Customer,N,Yes
3,4,Talbot,,Male,1961-10-03 00:00:00,,IT,Mass Customer,N,No
4,5,Sheila-kathryn,Calton,Female,1977-05-13 00:00:00,Senior Editor,,Affluent Customer,N,Yes
...,...,...,...,...,...,...,...,...,...,...
3995,3996,Rosalia,Halgarth,Female,1975-08-09 00:00:00,VP Product Management,Health,Mass Customer,N,No
3996,3997,Blanch,Nisuis,Female,2001-07-13 00:00:00,Statistician II,Manufacturing,High Net Worth,N,Yes
3997,3998,Sarene,Woolley,U,,Assistant Manager,IT,High Net Worth,N,No
3998,3999,Patrizius,,Male,1973-10-24 00:00:00,,Manufacturing,Affluent Customer,N,Yes


In [8]:
# Append into the pre-created tables; customer goes first because address rows reference it.
append_kwargs = dict(con=cnx, index=False, if_exists="append")
cust_df.to_sql("customer", **append_kwargs)
addr_df.to_sql("address", **append_kwargs)

1000

In [9]:
transaction_full_df = df['transaction']
# Drop, in place, every transaction row that has any missing value.
transaction_full_df.dropna(inplace=True)

# Prepare the data for the product table: one row per distinct combination of the
# product attributes (numeric values rounded to 2 decimals), numbered from 1.
product_columns = [
    'product_id', 'brand', 'product_line', 'product_class',
    'product_size', 'standard_cost', 'list_price',
]
product_df = (
    transaction_full_df[product_columns]
    .round(2)
    .drop_duplicates()
    .reset_index(drop=True)
)
product_df['id'] = product_df.index + 1
product_df

Unnamed: 0,product_id,brand,product_line,product_class,product_size,standard_cost,list_price,id
0,2,Solex,Standard,medium,medium,53.62,71.49,1
1,3,Trek Bicycles,Standard,medium,large,388.92,2091.47,2
2,37,OHM Cycles,Standard,low,medium,248.82,1793.43,3
3,88,Norco Bicycles,Standard,medium,medium,381.10,1198.46,4
4,78,Giant Bicycles,Standard,medium,large,709.48,1765.30,5
...,...,...,...,...,...,...,...,...
196,33,OHM Cycles,Road,medium,small,1610.90,1810.00,197
197,0,Solex,Standard,medium,large,151.96,202.62,198
198,70,Norco Bicycles,Road,medium,medium,206.35,1036.59,199
199,0,Solex,Standard,medium,medium,84.99,441.49,200


In [10]:
# Build the rows for the transaction table: attach the surrogate product id, keep only
# transactions of known customers, and rename columns to match the table schema.

# product_df was de-duplicated on ALL of these columns, so all of them are needed to
# identify exactly one product; joining on a subset could match several products and
# multiply transaction rows.
product_key_cols = [
    'product_id', 'brand', 'product_line', 'product_class',
    'product_size', 'standard_cost', 'list_price',
]
tx_df = (
    transaction_full_df
    # product_df holds prices rounded to 2 decimals; round this side the same way so
    # the float join keys compare equal.
    .round({'standard_cost': 2, 'list_price': 2})
    .merge(
        product_df[product_key_cols + ['id']],
        on=product_key_cols,
        how='left',
        validate='many_to_one',  # fail loudly if a transaction would match >1 product
    )
)
# Remove transactions whose customer_id is not present in the customer sheet
# (they would violate the foreign key to customer).
tx_df = tx_df[tx_df['customer_id'].isin(customer_full_df['customer_id'])]
tx_df = tx_df[['transaction_id', 'customer_id', 'id', 'transaction_date', 'online_order', 'order_status']]
# The merged-in surrogate key becomes the transaction's product_id ...
tx_df = tx_df.rename(columns={'id': 'product_id'})
tx_renamings = {
    'transaction_date': 'date',
    'transaction_id': 'id'
}
# ... and only then can transaction_id take over the name 'id'.
tx_df = tx_df.rename(columns=tx_renamings)
# Raises if any transaction failed to match a product (NaN cannot be cast to int).
tx_df['product_id'] = tx_df['product_id'].astype(np.int32)
tx_df = tx_df.drop_duplicates()
tx_df

Unnamed: 0,id,customer_id,product_id,date,online_order,order_status
0,1,2950,1,2017-02-25,False,Approved
1,2,3120,2,2017-05-21,True,Approved
2,3,402,3,2017-10-16,False,Approved
3,4,3135,4,2017-08-31,False,Approved
4,5,787,5,2017-10-01,True,Approved
...,...,...,...,...,...,...
19440,19996,1018,121,2017-06-24,True,Approved
19441,19997,127,97,2017-11-09,True,Approved
19442,19998,2284,146,2017-04-14,True,Approved
19443,19999,2764,42,2017-07-03,False,Approved


In [11]:
# Map the spreadsheet column names onto the product table's column names.
product_renamings = dict(
    product_id='legacy_product_id',
    product_line='line',
    product_class='class',
    product_size='size',
)
product_df.rename(columns=product_renamings, inplace=True)

# Products are inserted before transactions, whose product_id values refer to them.
load_kwargs = dict(con=cnx, index=False, if_exists="append")
product_df.to_sql("product", **load_kwargs)
tx_df.to_sql("transaction", **load_kwargs)

442