In [3]:
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine

## Connect to the MySQL database

In [4]:
connection = mysql.connector.connect(host = 'localhost', user = 'root', password = 'root', port = 3306, database='mysql')
cursor = connection.cursor(buffered=True)
cursor.execute("SELECT VERSION()")
cursor.fetchone()

('8.3.0',)

## Create new schema and connect to it

In [5]:
# create new shcema 
cursor.execute("CREATE DATABASE IF NOT EXISTS testdb")

# connect to the schema
cursor.execute("USE testdb;")
engine = create_engine('mysql+mysqlconnector://root:root@localhost:3306/testdb')

In [6]:
# clean up the schema

cursor.execute('''DROP TABLE IF EXISTS BLS;''')
cursor.execute('''DROP TABLE IF EXISTS transaction;''')
cursor.execute('''DROP TABLE IF EXISTS customer;''')

## Read csv and excel files into Pandas DataFrames

In [7]:
customer = pd.read_csv('../data/CUSTOMER.csv')
transaction = pd.read_csv('../data/TRANSACTION.csv')
wage = pd.read_excel('../data/national_M2022_dl.xlsx', sheet_name='national_M2022_dl')

### Upload Customer data to the database

In [8]:
customer.head()

Unnamed: 0,Customer_ID,Gender,Age,Profession_Code,Work_Experience,Family_Size
0,3991,Male,21,53-0000,3,4
1,10634,Female,29,25-3031,5,2
2,6726,Male,24,41-0000,3,3
3,5542,Female,36,15-1244,9,3
4,5740,Female,25,53-7000,1,2


In [9]:
customer.tail()

Unnamed: 0,Customer_ID,Gender,Age,Profession_Code,Work_Experience,Family_Size
9995,10089,Male,20,43-4051,4,3
9996,10139,Male,39,27-0000,10,3
9997,2565,Female,32,31-1131,7,6
9998,9262,Female,46,41-4010,16,2
9999,8633,Female,36,11-9000,9,4


In [10]:
# There is no duplicate customer in the dataset
customer.duplicated(subset=['Customer_ID']).sum()

0

In [11]:
# Check for missing values
customer.isnull().sum()

Customer_ID        0
Gender             0
Age                0
Profession_Code    0
Work_Experience    0
Family_Size        0
dtype: int64

In [12]:
# Upload the data to the database and set the primary key
customer.to_sql('customer', con=engine, if_exists='replace', index=False, schema='testdb')
cursor.execute('alter table customer add primary key (customer_id);')

In [13]:
# Top 5 rows of customer table
cursor.execute('SELECT * FROM customer LIMIT 5;')
cursor.fetchall()

[(1000, 'Female', 30, '35-2010', 3, 4),
 (1001, 'Female', 46, '43-5060', 9, 3),
 (1002, 'Male', 28, '53-7062', 6, 5),
 (1003, 'Female', 18, '31-1100', 2, 2),
 (1004, 'Female', 16, '51-3091', 1, 2)]

In [14]:
# Bottom 5 rows of customer table
cursor.execute('SELECT * FROM customer ORDER BY customer_id desc LIMIT 5;')
cursor.fetchall()

[(10999, 'Female', 43, '13-2010', 21, 4),
 (10998, 'Female', 48, '15-1252', 21, 4),
 (10997, 'Female', 23, '41-2000', 5, 1),
 (10996, 'Male', 21, '41-0000', 0, 5),
 (10995, 'Male', 32, '53-7062', 6, 3)]

### Upload transaction data to the database

In [15]:
transaction

Unnamed: 0,Customer_ID,Timestamp,Amount,Transaction_Type
0,10061,2023-09-28 01:33:49,64950.477598,Deposit
1,9183,2023-12-30 17:29:07,694.634030,Withdrawal
2,1442,2023-10-24 17:58:40,37023.330210,Deposit
3,7806,2024-02-11 01:06:14,28538.342907,Deposit
4,9932,2024-01-12 11:42:00,1788.829725,Withdrawal
...,...,...,...,...
99995,4947,2023-09-17 02:43:31,31369.274354,Deposit
99996,4220,2023-10-31 22:11:25,51.839492,Card
99997,1431,2023-09-09 23:09:01,16.377269,Card
99998,1120,2023-12-18 13:32:39,21.086196,Card


In [16]:
# Checking for duplicates
transaction.duplicated().sum()

0

In [17]:
# Checking for missing values
transaction.isnull().sum()

Customer_ID         0
Timestamp           0
Amount              0
Transaction_Type    0
dtype: int64

In [18]:
# Set index as txn_id
transaction.reset_index(inplace=True)
transaction.rename(columns = {'index':'txn_id'}, inplace = True)
transaction.head()

Unnamed: 0,txn_id,Customer_ID,Timestamp,Amount,Transaction_Type
0,0,10061,2023-09-28 01:33:49,64950.477598,Deposit
1,1,9183,2023-12-30 17:29:07,694.63403,Withdrawal
2,2,1442,2023-10-24 17:58:40,37023.33021,Deposit
3,3,7806,2024-02-11 01:06:14,28538.342907,Deposit
4,4,9932,2024-01-12 11:42:00,1788.829725,Withdrawal


In [19]:
# Upload the data to the database and set the primary key and foreign key
transaction.to_sql('transaction', con=engine, if_exists='replace', index=False, schema='testdb')
cursor.execute('alter table transaction add primary key (txn_id);')
cursor.execute('alter table transaction add foreign key (Customer_ID) references customer(customer_id);')

In [20]:
# Top 5 rows of transaction table
cursor.execute('select * from transaction limit 5;')
cursor.fetchall()

[(0, 10061, '2023-09-28 01:33:49', 64950.47759795844, 'Deposit'),
 (1, 9183, '2023-12-30 17:29:07', 694.6340299521144, 'Withdrawal'),
 (2, 1442, '2023-10-24 17:58:40', 37023.33021014404, 'Deposit'),
 (3, 7806, '2024-02-11 01:06:14', 28538.342906671933, 'Deposit'),
 (4, 9932, '2024-01-12 11:42:00', 1788.8297246399188, 'Withdrawal')]

In [21]:
# Bottom 5 rows of transaction table
cursor.execute('select * from transaction order by txn_id desc limit 5;')
cursor.fetchall()

[(99999, 6194, '2023-12-19 18:08:29', 520.7688525560133, 'Withdrawal'),
 (99998, 1120, '2023-12-18 13:32:39', 21.086196293901125, 'Card'),
 (99997, 1431, '2023-09-09 23:09:01', 16.377268752997804, 'Card'),
 (99996, 4220, '2023-10-31 22:11:25', 51.83949247905126, 'Card'),
 (99995, 4947, '2023-09-17 02:43:31', 31369.27435359539, 'Deposit')]

## Upload BLS data to the database

In [22]:
wage

Unnamed: 0,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,OCC_TITLE,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,00-0000,All Occupations,...,22.26,35.32,53.03,27340,33330,46310,73460,110290,,
1,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,...,51.62,78.71,106.03,50290,75350,107360,163710,220550,,
2,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1000,Top Executives,...,48.02,76.96,#,43440,62520,99890,160070,#,,
3,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1010,Chief Executives,...,91.12,#,#,74920,122480,189520,#,#,,
4,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1011,Chief Executives,...,91.12,#,#,74920,122480,189520,#,#,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1397,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,53-7081,Refuse and Recyclable Material Collectors,...,20.94,25.87,30.96,28190,34040,43540,53800,64390,,
1398,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,53-7120,"Tank Car, Truck, and Ship Loaders",...,25.93,36.38,42.62,36730,44500,53930,75670,88650,,
1399,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,53-7121,"Tank Car, Truck, and Ship Loaders",...,25.93,36.38,42.62,36730,44500,53930,75670,88650,,
1400,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,53-7190,Miscellaneous Material Moving Workers,...,18.65,23.48,28.88,30070,33280,38800,48840,60070,,


Notes:
- \*  = indicates that a wage estimate is not available
- \*\*  = indicates that an employment estimate is not available
- \#  = indicates a wage equal to or greater than $115.00 per hour or $239,200 per year 

In [23]:
print(wage.columns[wage.isin(['*']).any()].tolist())
print(wage.columns[wage.isin(['**']).any()].tolist())

['H_MEAN', 'A_MEAN', 'H_PCT10', 'H_PCT25', 'H_MEDIAN', 'H_PCT75', 'H_PCT90', 'A_PCT10', 'A_PCT25', 'A_MEDIAN', 'A_PCT75', 'A_PCT90']
[]


In [24]:
wage.replace('*', 'NaN', inplace=True)
wage.replace('**', 'NaN', inplace=True)

In [25]:
# find all columns value conatin '#'
wage.columns[wage.isin(['#']).any()].tolist()

['H_PCT25',
 'H_MEDIAN',
 'H_PCT75',
 'H_PCT90',
 'A_PCT25',
 'A_MEDIAN',
 'A_PCT75',
 'A_PCT90']

* h_pct25 - Hourly 25th percentile wage
* h_median - Hourly median wage (or the 50th percentile)
* h_pct75 - Hourly 75th percentile wage
* h_pct90 - Hourly 90th percentile wage
* a_pct25 - Annual 25th percentile wage
* a_median - Annual median wage (or the 50th percentile)
* a_pct75 - Annual 75th percentile wage
* a_pct90 - Annual 90th percentile wage

In [26]:
# for column start with 'H_' replace '#' with '115.00' and make sure the column is numeric
wage.loc[:, wage.columns.str.startswith('H_')] = wage.loc[:, wage.columns.str.startswith('H_')].replace('#', '115.00')
# for column start with 'A_' replace '#' with '239200'
wage.loc[:, wage.columns.str.startswith('A_')] = wage.loc[:, wage.columns.str.startswith('A_')].replace('#', '239200')

* annual - Contains "TRUE" if only annual wages are released. The OEWS program releases only annual wages for some occupations that typically work fewer than 2,080 hours per year, but are paid on an annual basis, such as teachers, pilots, and athletes.
* hourly - Contains "TRUE" if only hourly wages are released. The OEWS program releases only hourly wages for some occupations that typically work fewer than 2,080 hours per year and are paid on an hourly basis, such as actors, dancers, and musicians and singers.

In [27]:
# replace nan with 0 for annual and hourly wage flag
wage['ANNUAL'] = wage['ANNUAL'].replace('NaN', 0)
wage['HOURLY'] = wage['HOURLY'].replace('NaN', 0)

In [28]:
wage

Unnamed: 0,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,OCC_TITLE,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,00-0000,All Occupations,...,22.26,35.32,53.03,27340,33330,46310,73460,110290,,
1,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,...,51.62,78.71,106.03,50290,75350,107360,163710,220550,,
2,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1000,Top Executives,...,48.02,76.96,115.00,43440,62520,99890,160070,239200,,
3,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1010,Chief Executives,...,91.12,115.00,115.00,74920,122480,189520,239200,239200,,
4,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1011,Chief Executives,...,91.12,115.00,115.00,74920,122480,189520,239200,239200,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1397,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,53-7081,Refuse and Recyclable Material Collectors,...,20.94,25.87,30.96,28190,34040,43540,53800,64390,,
1398,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,53-7120,"Tank Car, Truck, and Ship Loaders",...,25.93,36.38,42.62,36730,44500,53930,75670,88650,,
1399,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,53-7121,"Tank Car, Truck, and Ship Loaders",...,25.93,36.38,42.62,36730,44500,53930,75670,88650,,
1400,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,53-7190,Miscellaneous Material Moving Workers,...,18.65,23.48,28.88,30070,33280,38800,48840,60070,,


In [29]:
# Set index as wage_id for creating primary key
wage.reset_index(inplace=True)
wage.rename(columns = {'index':'wage_id'}, inplace = True)
wage.head()

Unnamed: 0,wage_id,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,0,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,00-0000,...,22.26,35.32,53.03,27340,33330,46310,73460,110290,,
1,1,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-0000,...,51.62,78.71,106.03,50290,75350,107360,163710,220550,,
2,2,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1000,...,48.02,76.96,115.0,43440,62520,99890,160070,239200,,
3,3,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1010,...,91.12,115.0,115.0,74920,122480,189520,239200,239200,,
4,4,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1011,...,91.12,115.0,115.0,74920,122480,189520,239200,239200,,


#### There exist duplicate for OCC_CODE in the BLS data. After checking the data, I found that for two duplicate OCC_CODE, the only difference is the O_GROUP, which is the SOC occupation level. From description, I decided to keep the one with the O_GROUP as 'major' and drop the other one.

In [30]:
# checking for duplicates after dropping wage_id
wage[wage.duplicated(subset=list(wage.columns.drop('wage_id')))]

Unnamed: 0,wage_id,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY


In [31]:
# checking for duplicates after drop O_GROUP and wage_id columns
wage[wage.duplicated(subset=list(wage.columns.drop('O_GROUP').drop('wage_id')), keep=False)]

Unnamed: 0,wage_id,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
78,78,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,13-1020,...,32.51,43.71,55.95,41060,51820,67620,90920,116370,,
79,79,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,13-1020,...,32.51,43.71,55.95,41060,51820,67620,90920,116370,,
111,111,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,13-2020,...,29.6,39.88,56.54,35700,46530,61560,82950,117600,,
112,112,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,13-2020,...,29.6,39.88,56.54,35700,46530,61560,82950,117600,,
572,572,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,29-2010,...,27.59,36.02,40.71,35220,40440,57380,74920,84670,,
573,573,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,29-2010,...,27.59,36.02,40.71,35220,40440,57380,74920,84670,,
611,611,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,31-1120,...,14.51,16.4,18.44,22500,27100,30180,34110,38350,,
612,612,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,31-1120,...,14.51,16.4,18.44,22500,27100,30180,34110,38350,,
778,778,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,39-7010,...,16.56,20.61,26.51,23400,28220,34440,42870,55130,,
779,779,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,39-7010,...,16.56,20.61,26.51,23400,28220,34440,42870,55130,,


In [32]:
wage[wage.duplicated(subset=['OCC_CODE'], keep=False)]

Unnamed: 0,wage_id,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
78,78,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,13-1020,...,32.51,43.71,55.95,41060,51820,67620,90920,116370,,
79,79,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,13-1020,...,32.51,43.71,55.95,41060,51820,67620,90920,116370,,
111,111,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,13-2020,...,29.6,39.88,56.54,35700,46530,61560,82950,117600,,
112,112,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,13-2020,...,29.6,39.88,56.54,35700,46530,61560,82950,117600,,
572,572,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,29-2010,...,27.59,36.02,40.71,35220,40440,57380,74920,84670,,
573,573,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,29-2010,...,27.59,36.02,40.71,35220,40440,57380,74920,84670,,
611,611,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,31-1120,...,14.51,16.4,18.44,22500,27100,30180,34110,38350,,
612,612,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,31-1120,...,14.51,16.4,18.44,22500,27100,30180,34110,38350,,
778,778,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,39-7010,...,16.56,20.61,26.51,23400,28220,34440,42870,55130,,
779,779,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,39-7010,...,16.56,20.61,26.51,23400,28220,34440,42870,55130,,


In [33]:
# drop duplicates by OCC_CODE
wage.drop_duplicates(subset=['OCC_CODE'], inplace=True)
wage[wage.duplicated(subset=['OCC_CODE'])]

Unnamed: 0,wage_id,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY


In [34]:
# Upload the data to the database and set the primary key
wage.to_sql('BLS', con=engine, if_exists='replace', index=False, schema='testdb')
cursor.execute('alter table BLS add primary key (wage_id);')

In [35]:
# Top 5 rows of BLS table
cursor.execute('select * from BLS limit 5;')
pd.DataFrame(cursor.fetchall(), columns=[x[0] for x in cursor.description])

Unnamed: 0,wage_id,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,0,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,00-0000,...,22.26,35.32,53.03,27340,33330,46310,73460,110290,,
1,1,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-0000,...,51.62,78.71,106.03,50290,75350,107360,163710,220550,,
2,2,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1000,...,48.02,76.96,115.0,43440,62520,99890,160070,239200,,
3,3,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1010,...,91.12,115.0,115.0,74920,122480,189520,239200,239200,,
4,4,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1011,...,91.12,115.0,115.0,74920,122480,189520,239200,239200,,


In [36]:
# Bottom 5 rows of BLS table
cursor.execute('select * from BLS order by wage_id desc limit 5;')
pd.DataFrame(cursor.fetchall(), columns=[x[0] for x in cursor.description])

Unnamed: 0,wage_id,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1401,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,53-7199,...,18.65,23.48,28.88,30070,33280,38800,48840,60070,,
1,1400,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,53-7190,...,18.65,23.48,28.88,30070,33280,38800,48840,60070,,
2,1399,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,53-7121,...,25.93,36.38,42.62,36730,44500,53930,75670,88650,,
3,1398,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,53-7120,...,25.93,36.38,42.62,36730,44500,53930,75670,88650,,
4,1397,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,53-7081,...,20.94,25.87,30.96,28190,34040,43540,53800,64390,,


## Obtain the annual median wage and ingest into customer table in the database

In [37]:
# alter customer table to add median annual wage from BLS table (join on OCC_CODE)
cursor.execute('alter table customer add column median_annual_wage TEXT;')
cursor.execute('update customer c join BLS b on c.profession_code = b.OCC_CODE set c.median_annual_wage = b.A_MEDIAN;')

In [38]:
# Top 5 rows of customer table
cursor.execute('select * from customer limit 5;')
cursor.fetchall()

[(1000, 'Female', 30, '35-2010', 3, 4, '30910'),
 (1001, 'Female', 46, '43-5060', 9, 3, '50630'),
 (1002, 'Male', 28, '53-7062', 6, 5, '36110'),
 (1003, 'Female', 18, '31-1100', 2, 2, '31450'),
 (1004, 'Female', 16, '51-3091', 1, 2, '37790')]

In [39]:
# Bottom 5 rows of customer table
cursor.execute('select * from customer order by customer_id desc limit 5;')
cursor.fetchall()

[(10999, 'Female', 43, '13-2010', 21, 4, '78000'),
 (10998, 'Female', 48, '15-1252', 21, 4, '127260'),
 (10997, 'Female', 23, '41-2000', 5, 1, '29660'),
 (10996, 'Male', 21, '41-0000', 0, 5, '35290'),
 (10995, 'Male', 32, '53-7062', 6, 3, '36110')]

In [40]:
# validate the data
pd.merge(customer, wage[['OCC_CODE', 'A_MEDIAN']], left_on='Profession_Code', right_on='OCC_CODE', how='inner')

Unnamed: 0,Customer_ID,Gender,Age,Profession_Code,Work_Experience,Family_Size,OCC_CODE,A_MEDIAN
0,3991,Male,21,53-0000,3,4,53-0000,37940
1,10634,Female,29,25-3031,5,2,25-3031,35250
2,6726,Male,24,41-0000,3,3,41-0000,35290
3,5542,Female,36,15-1244,9,3,15-1244,90520
4,5740,Female,25,53-7000,1,2,53-7000,35670
...,...,...,...,...,...,...,...,...
9995,10089,Male,20,43-4051,4,3,43-4051,37780
9996,10139,Male,39,27-0000,10,3,27-0000,58030
9997,2565,Female,32,31-1131,7,6,31-1131,35760
9998,9262,Female,46,41-4010,16,2,41-4010,67750


### Close the connection to the database

In [41]:
connection.commit()
cursor.close()
connection.close()