# Connecting to the database

In [49]:
import sqlite3

con = sqlite3.connect('cademycode.db')
cur = con.cursor()

# Loading into data frames

## Table analysis

In [50]:
## checking the tables in the database
tables = cur.execute('''SELECT * FROM sqlite_master''').fetchall()

for table in tables:
    print(table)

('table', 'cademycode_students', 'cademycode_students', 2, 'CREATE TABLE cademycode_students (\n\tuuid INTEGER, \n\tname VARCHAR, \n\tdob VARCHAR, \n\tsex TEXT, \n\tcontact_info JSON, \n\tjob_id VARCHAR, \n\tnum_course_taken VARCHAR, \n\tcurrent_career_path_id VARCHAR, \n\ttime_spent_hrs VARCHAR\n)')
('table', 'cademycode_courses', 'cademycode_courses', 5, 'CREATE TABLE cademycode_courses (\n\tcareer_path_id BIGINT, \n\tcareer_path_name TEXT, \n\thours_to_complete BIGINT\n)')
('table', 'cademycode_student_jobs', 'cademycode_student_jobs', 6, 'CREATE TABLE cademycode_student_jobs (\n\tjob_id BIGINT, \n\tjob_category TEXT, \n\tavg_salary BIGINT\n)')


In [51]:
## checking columns in the tables
for table in tables:
    table_name = table[1]
    columns = cur.execute(f'''PRAGMA table_info({table_name})''').fetchall()
    print(f'\n{table_name}')
    for column in columns:
        print(column)


cademycode_students
(0, 'uuid', 'INTEGER', 0, None, 0)
(1, 'name', 'VARCHAR', 0, None, 0)
(2, 'dob', 'VARCHAR', 0, None, 0)
(3, 'sex', 'TEXT', 0, None, 0)
(4, 'contact_info', 'JSON', 0, None, 0)
(5, 'job_id', 'VARCHAR', 0, None, 0)
(6, 'num_course_taken', 'VARCHAR', 0, None, 0)
(7, 'current_career_path_id', 'VARCHAR', 0, None, 0)
(8, 'time_spent_hrs', 'VARCHAR', 0, None, 0)

cademycode_courses
(0, 'career_path_id', 'BIGINT', 0, None, 0)
(1, 'career_path_name', 'TEXT', 0, None, 0)
(2, 'hours_to_complete', 'BIGINT', 0, None, 0)

cademycode_student_jobs
(0, 'job_id', 'BIGINT', 0, None, 0)
(1, 'job_category', 'TEXT', 0, None, 0)
(2, 'avg_salary', 'BIGINT', 0, None, 0)


## Loading

In [52]:
import pandas as pd

## reading the tables into pandas
cademycode_students = pd.read_sql_query('SELECT * FROM cademycode_students', con)
cademycode_courses = pd.read_sql_query('SELECT * FROM cademycode_courses', con)
cademycode_student_jobs = pd.read_sql_query('SELECT * FROM cademycode_student_jobs', con)

### Students

In [53]:
print('\n### top 5 rows ###')
print(cademycode_students.head())


### top 5 rows ###
   uuid             name         dob sex  \
0     1  Annabelle Avery  1943-07-03   F   
1     2      Micah Rubio  1991-02-07   M   
2     3       Hosea Dale  1989-12-07   M   
3     4     Mariann Kirk  1988-07-31   F   
4     5  Lucio Alexander  1963-08-31   M   

                                        contact_info job_id num_course_taken  \
0  {"mailing_address": "303 N Timber Key, Irondal...    7.0              6.0   
1  {"mailing_address": "767 Crescent Fair, Shoals...    7.0              5.0   
2  {"mailing_address": "P.O. Box 41269, St. Bonav...    7.0              8.0   
3  {"mailing_address": "517 SE Wintergreen Isle, ...    6.0              7.0   
4  {"mailing_address": "18 Cinder Cliff, Doyles b...    7.0             14.0   

  current_career_path_id time_spent_hrs  
0                    1.0           4.99  
1                    8.0            4.4  
2                    8.0           6.74  
3                    9.0          12.31  
4                    3.0

In [54]:
print('\n### info ###')
print(cademycode_students.info())


### info ###
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   uuid                    5000 non-null   int64 
 1   name                    5000 non-null   object
 2   dob                     5000 non-null   object
 3   sex                     5000 non-null   object
 4   contact_info            5000 non-null   object
 5   job_id                  4995 non-null   object
 6   num_course_taken        4749 non-null   object
 7   current_career_path_id  4529 non-null   object
 8   time_spent_hrs          4529 non-null   object
dtypes: int64(1), object(8)
memory usage: 351.7+ KB
None


In [55]:
cademycode_students = cademycode_students.drop_duplicates()

print('\n### info after update ###')
print(cademycode_students.info())


### info after update ###
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   uuid                    5000 non-null   int64 
 1   name                    5000 non-null   object
 2   dob                     5000 non-null   object
 3   sex                     5000 non-null   object
 4   contact_info            5000 non-null   object
 5   job_id                  4995 non-null   object
 6   num_course_taken        4749 non-null   object
 7   current_career_path_id  4529 non-null   object
 8   time_spent_hrs          4529 non-null   object
dtypes: int64(1), object(8)
memory usage: 351.7+ KB
None


In [56]:
print(cur.execute('''SELECT contact_info FROM cademycode_students''').fetchone())

('{"mailing_address": "303 N Timber Key, Irondale, Wisconsin, 84736", "email": "annabelle_avery9376@woohoo.com"}',)


In [57]:
print(cademycode_students['contact_info'].head())

0    {"mailing_address": "303 N Timber Key, Irondal...
1    {"mailing_address": "767 Crescent Fair, Shoals...
2    {"mailing_address": "P.O. Box 41269, St. Bonav...
3    {"mailing_address": "517 SE Wintergreen Isle, ...
4    {"mailing_address": "18 Cinder Cliff, Doyles b...
Name: contact_info, dtype: object


In [58]:
# TODO: split the contact_info column into two columns: mailing address and email
# {"mailing_address": "303 N Timber Key, Irondale, Wisconsin, 84736", "email": "annabelle_avery9376@woohoo.com"}
cademycode_students['mailing_address'] = cademycode_students['contact_info'].apply(lambda x: x.split('", ')[0])
cademycode_students['email'] = cademycode_students['contact_info'].apply(lambda x: x.split('", ')[1])

# drop the contact_info column
cademycode_students.drop('contact_info', axis=1, inplace=True)

# updated dataframe
print(cademycode_students.head())

   uuid             name         dob sex job_id num_course_taken  \
0     1  Annabelle Avery  1943-07-03   F    7.0              6.0   
1     2      Micah Rubio  1991-02-07   M    7.0              5.0   
2     3       Hosea Dale  1989-12-07   M    7.0              8.0   
3     4     Mariann Kirk  1988-07-31   F    6.0              7.0   
4     5  Lucio Alexander  1963-08-31   M    7.0             14.0   

  current_career_path_id time_spent_hrs  \
0                    1.0           4.99   
1                    8.0            4.4   
2                    8.0           6.74   
3                    9.0          12.31   
4                    3.0           5.64   

                                     mailing_address  \
0  {"mailing_address": "303 N Timber Key, Irondal...   
1  {"mailing_address": "767 Crescent Fair, Shoals...   
2  {"mailing_address": "P.O. Box 41269, St. Bonav...   
3  {"mailing_address": "517 SE Wintergreen Isle, ...   
4  {"mailing_address": "18 Cinder Cliff, Doyles b...

In [68]:
# clean up the new columns
cademycode_students['mailing_address'] = cademycode_students['mailing_address'].apply(lambda x: x.replace('{', '').replace('}', '').replace('"', '').replace('mailing_address:', ''))
cademycode_students['email'] = cademycode_students['email'].apply(lambda x: x.replace('{', '').replace('}', '').replace('"', '').replace('email:', ''))

print(cademycode_students.head())

   uuid             name         dob sex job_id num_course_taken  \
0     1  Annabelle Avery  1943-07-03   F    7.0              6.0   
1     2      Micah Rubio  1991-02-07   M    7.0              5.0   
2     3       Hosea Dale  1989-12-07   M    7.0              8.0   
3     4     Mariann Kirk  1988-07-31   F    6.0              7.0   
4     5  Lucio Alexander  1963-08-31   M    7.0             14.0   

  current_career_path_id time_spent_hrs  \
0                    1.0           4.99   
1                    8.0            4.4   
2                    8.0           6.74   
3                    9.0          12.31   
4                    3.0           5.64   

                                     mailing_address  \
0       303 N Timber Key, Irondale, Wisconsin, 84736   
1          767 Crescent Fair, Shoals, Indiana, 37439   
2   P.O. Box 41269, St. Bonaventure, Virginia, 83637   
3     517 SE Wintergreen Isle, Lane, Arkansas, 82242   
4   18 Cinder Cliff, Doyles borough, Rhode Island...

### Courses

In [60]:
print('\n### top 5 rows ###')
print(cademycode_courses.head())

print('\n### info ###')
print(cademycode_courses.info())


### top 5 rows ###
   career_path_id      career_path_name  hours_to_complete
0               1        data scientist                 20
1               2         data engineer                 20
2               3          data analyst                 12
3               4  software engineering                 25
4               5      backend engineer                 18

### info ###
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   career_path_id     10 non-null     int64 
 1   career_path_name   10 non-null     object
 2   hours_to_complete  10 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 372.0+ bytes
None


In [61]:
cademycode_courses = cademycode_courses.drop_duplicates()

print('\n### info after update ###')
print(cademycode_courses.info())


### info after update ###
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   career_path_id     10 non-null     int64 
 1   career_path_name   10 non-null     object
 2   hours_to_complete  10 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 372.0+ bytes
None


### Student Jobs

In [62]:
print('\n### top 5 rows ###')
print(cademycode_student_jobs.head())

print('\n### info ###')
print(cademycode_student_jobs.info())


### top 5 rows ###
   job_id        job_category  avg_salary
0       1           analytics       86000
1       2            engineer      101000
2       3  software developer      110000
3       4            creative       66000
4       5  financial services      135000

### info ###
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   job_id        13 non-null     int64 
 1   job_category  13 non-null     object
 2   avg_salary    13 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 444.0+ bytes
None


In [63]:
cademycode_student_jobs = cademycode_student_jobs.drop_duplicates()

print('\n### info after update ###')
print(cademycode_student_jobs.info())


### info after update ###
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   job_id        10 non-null     int64 
 1   job_category  10 non-null     object
 2   avg_salary    10 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 320.0+ bytes
None


# Output CSV

In [64]:
con2 = sqlite3.connect('cademycode_updated.db')
cur2 = con2.cursor()
