In [1]:
#Setting dependencies

import numpy as np
import pandas as pd

import json
import requests
import time

import matplotlib.pyplot as plt
from pandas.plotting import table
from pprint import pprint
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

#Import the python driver for PostgreSQL
import psycopg2
from passwords import password

In [2]:
df = pd.read_csv('dass.csv')

In [3]:
df.head()

Unnamed: 0,country,Critical-quarrelsome,Dependable-self_disciplined,Anxious-easily upset,Open to new experiences-complex,Reserved-quiet,Sympathetic-warm,Disorganized-careless,Calm-emotionally_stable,Conventional-uncreative,...,major,Depression,Depression_cat,Anxiety,Anxiety_cat,Stress,Stress_cat,dep_cond,anx_cond,str_cond
0,IN,5,7,7,7,7,7,5,1,1,...,No Degree,27,3,34,4,40,4,Severe,Extremely Severe,Extremely Severe
1,US,5,4,7,5,4,7,7,1,5,...,No Degree,24,3,17,3,27,3,Severe,Severe,Severe
2,PL,5,2,2,5,6,5,5,3,2,...,No Degree,39,4,12,2,17,1,Extremely Severe,Moderate,Mild
3,US,1,7,4,6,4,6,1,6,1,...,Biology,16,2,17,3,16,1,Moderate,Severe,Mild
4,MY,5,3,6,5,5,5,6,3,3,...,Psychology,32,4,40,4,29,3,Extremely Severe,Extremely Severe,Severe


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39737 entries, 0 to 39736
Data columns (total 32 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country                          39737 non-null  object 
 1   Critical-quarrelsome             39737 non-null  int64  
 2   Dependable-self_disciplined      39737 non-null  int64  
 3   Anxious-easily upset             39737 non-null  int64  
 4   Open to new experiences-complex  39737 non-null  int64  
 5   Reserved-quiet                   39737 non-null  int64  
 6   Sympathetic-warm                 39737 non-null  int64  
 7   Disorganized-careless            39737 non-null  int64  
 8   Calm-emotionally_stable          39737 non-null  int64  
 9   Conventional-uncreative          39737 non-null  int64  
 10  education                        39737 non-null  int64  
 11  urban                            39737 non-null  int64  
 12  gender            

familysize was left as a float - changing it to int to combine familysize of 3 (which was the mean inserted into the ones that had a value greater than 15)

In [5]:
df['familysize'].value_counts()

3.000000     9195
2.000000     9012
4.000000     7532
5.000000     4826
1.000000     4065
6.000000     2448
7.000000     1243
8.000000      675
9.000000      331
10.000000     194
11.000000     109
12.000000      56
13.000000      19
3.510381       18
14.000000       8
15.000000       6
Name: familysize, dtype: int64

In [6]:
df[["familysize"]] = df[["familysize"]].astype(int)
df['familysize'].value_counts()

3     9213
2     9012
4     7532
5     4826
1     4065
6     2448
7     1243
8      675
9      331
10     194
11     109
12      56
13      19
14       8
15       6
Name: familysize, dtype: int64

Create keys for each planned table and write each table out to a separate csv file.

In [7]:
class_labels = LabelEncoder()

In [8]:
df['age_grp_id'] = class_labels.fit_transform(df['age_group'].values)

In [9]:
df['person_info'] = df['gender'].astype(str) + df['age_grp_id'].astype(str) + df['race'].astype(str) + df['orientation'].astype(str) + df['hand'].astype(str) + df['married'].astype(str) + df['familysize'].astype(str) + df['engnat'].astype(str) + df['urban'].astype(str) + df['country'] +df['Depression'].astype(str) + df['Anxiety'].astype(str) + df['Stress'].astype(str) + df['education'].astype(str)+df['major'] + df['religion'].astype(str) + df['race'].astype(str) + df['voted'].astype(str) +df['Dependable-self_disciplined'].astype(str)+df['Anxious-easily upset'].astype(str)+df['Open to new experiences-complex'].astype(str) + df['Reserved-quiet'].astype(str)+df['Sympathetic-warm'].astype(str)
df['person_id'] = class_labels.fit_transform(df['person_info'].values)


In [10]:
df['person_id'].value_counts()

29077    1
9396     1
462      1
36646    1
34407    1
        ..
7266     1
35274    1
25810    1
1511     1
21183    1
Name: person_id, Length: 39737, dtype: int64

In [11]:
df['loc_info'] = df['urban'].astype(str) + df['country'] + df['person_info']
df['loc_id'] = class_labels.fit_transform(df['loc_info'].values)
df['dass_info'] = df['Depression'].astype(str) + df['Anxiety'].astype(str) + df['Stress'].astype(str) + df['person_info']
df['dass_id'] = class_labels.fit_transform(df['dass_info'])

Remove interim columns now that keys are created

In [12]:
df.drop(['person_info', 'loc_info', 'dass_info'], axis='columns', inplace=True)

In [13]:
person_df = df.filter(['person_id','gender','race', 'orientation', 'married', 'hand', 'familysize', 'engnat', 'age_grp_id'], axis=1)
age_grp_df = df.filter(['age_grp_id', 'age_group'], axis=1)
loc_info_df = df.filter(['loc_id', 'urban', 'country', 'person_id'], axis=1)
dass_info_df = df.filter(['dass_id', 'Depression', 'Depression_cat', 'Anxiety', 'Anxiety_cat', 'Stress', 'Stress_cat', 'person_id'], axis=1)

Check each one to make sure data looks good

In [14]:
person_df.head()

Unnamed: 0,person_id,gender,race,orientation,married,hand,familysize,engnat,age_grp_id
0,29077,2,10,1,1,1,2,2,7
1,39093,2,70,5,1,2,4,1,7
2,37286,2,60,3,1,1,3,2,7
3,39102,2,70,5,1,2,5,1,7
4,31768,2,10,1,1,3,4,2,7


In [15]:
person_df.value_counts()

person_id  gender  race  orientation  married  hand  familysize  engnat  age_grp_id
0          1       10    1            1        1     10          2       0             1
26494      2       60    1            1        1     1           1       3             1
26487      2       50    1            2        1     1           1       3             1
26488      2       50    1            2        1     4           1       3             1
26489      2       50    2            3        1     3           2       3             1
                                                                                      ..
13247      2       10    1            1        1     7           2       0             1
13248      2       10    1            1        1     7           2       0             1
13249      2       10    1            1        1     7           2       0             1
13250      2       10    1            1        1     7           2       0             1
39736      3       70    5

In [17]:
person_df.value_counts()

person_id  gender  race  orientation  married  hand  familysize  engnat  age_grp_id
0          1       10    1            1        1     10          2       0             1
26494      2       60    1            1        1     1           1       3             1
26487      2       50    1            2        1     1           1       3             1
26488      2       50    1            2        1     4           1       3             1
26489      2       50    2            3        1     3           2       3             1
                                                                                      ..
13247      2       10    1            1        1     7           2       0             1
13248      2       10    1            1        1     7           2       0             1
13249      2       10    1            1        1     7           2       0             1
13250      2       10    1            1        1     7           2       0             1
39736      3       70    5

In [18]:
age_grp_df.head()

Unnamed: 0,age_grp_id,age_group
0,7,Below 20
1,7,Below 20
2,7,Below 20
3,7,Below 20
4,7,Below 20


In [19]:
age_grp_df.value_counts()

age_grp_id  age_group
0           20-24        14766
7           Below 20     13820
1           25-29         5439
2           30-34         2194
4           40-49         1321
3           35-39         1138
5           50-59          764
6           Above 60       295
dtype: int64

There are duplicates in the age_grp_df since it is a domain table - will be dropping the duplicates

In [20]:
age_grp_df=age_grp_df.drop_duplicates()

In [21]:
age_grp_df.value_counts()

age_grp_id  age_group
0           20-24        1
1           25-29        1
2           30-34        1
3           35-39        1
4           40-49        1
5           50-59        1
6           Above 60     1
7           Below 20     1
dtype: int64

In [22]:
loc_info_df.head()

Unnamed: 0,loc_id,urban,country,person_id
0,24936,3,IN,29077
1,39537,3,US,39093
2,36175,3,PL,37286
3,39539,3,US,39102
4,17051,2,MY,31768


In [23]:
dass_info_df.head()

Unnamed: 0,dass_id,Depression,Depression_cat,Anxiety,Anxiety_cat,Stress,Stress_cat,person_id
0,19679,27,3,34,4,40,4,29077
1,16115,24,3,17,3,27,3,39093
2,30383,39,4,12,2,17,1,37286
3,7826,16,2,17,3,16,1,39102
4,24629,32,4,40,4,29,3,31768


Write each of the new dataframes to csv files.

In [24]:
person_df.to_csv("person.csv",index=False)
age_grp_df.to_csv("age_grp.csv",index=False)
loc_info_df.to_csv("loc_info.csv",index=False)
dass_info_df.to_csv("dass_info.csv",index=False)

![Data_Model_Final_Project.svg](attachment:Data_Model_Final_Project.svg)

Begin the database work - create tables and then load them.

# Create database

In [48]:
import psycopg2

from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

 

# Connect to PostgreSQL DBMS

con = psycopg2.connect(user = "postgres", password = password);

con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT);

 

# Obtain a DB Cursor

cursor          = con.cursor();

name_Database   = "dass";

 

# Create table statement

sqlCreateDatabase = "create database "+name_Database+";"

 

# Create a table in PostgreSQL database

cursor.execute(sqlCreateDatabase);

## Create tables

In [28]:
def create_tables():
    """ Use create commands to add tables to dass database in postgres"""
    
    commands = (
        """
        CREATE TABLE age_grp(
        age_grp_id SERIAL PRIMARY KEY,
        age_group CHAR(30))
        """,
        """
        CREATE TABLE person_info(
        person_id SERIAL PRIMARY KEY,
        gender INTEGER,
        race INTEGER,
        orientation INTEGER,
        married INTEGER,
        hand INTEGER,
        familysize INTEGER,
        engnat INTEGER,
        age_grp_id SERIAL REFERENCES age_grp(age_grp_id))
        """,
        """
        CREATE TABLE location_info(
        loc_id SERIAL PRIMARY KEY,
        urban INTEGER, 
        country CHAR(2),
        person_id SERIAL REFERENCES person_info(person_id))
        """,
        """
        CREATE TABLE dass_info(
        dass_id SERIAL PRIMARY KEY,
        dep_score INTEGER,
        dep_level INTEGER,
        anxiety_score INTEGER,
        anxiety_level INTEGER,
        stress_score INTEGER,
        stress_level INTEGER,
        person_id SERIAL REFERENCES person_info(person_id))
        """
        )
    
    
    #Create a connection credentials to the PostgreSQL database
    try:
        conn = psycopg2.connect(user = "postgres", password = password, database = "dass")
        print("Connection established")
        
        #Create a cursor connection object to a PostgreSQL instance 
        cur = conn.cursor()
        print("Cursor opened")
    
        for command in commands:
            cur.execute(command)
    
        print("Tables created")
    
        #Commit transaction and prints the result successfully
        conn.commit()
        print ("Commit successful")

    #Handle the error throws by the command that is useful when using python while working with PostgreSQL
    except(Exception, psycopg2.Error) as error:
        print("Error connecting to PostgreSQL database", error)
        conn = None

    #Close the database connection
    finally:
        if(conn != None):
            cur.close()
            conn.close()
            print("PostgreSQL cursor & connection is now closed")
            
if __name__ == '__main__':
    create_tables()

Connection established
Cursor opened
Tables created
Commit successful
PostgreSQL cursor & connection is now closed


## Load tables

In [29]:
def load_table_func(conn,csv_file,table_na):
    cur = conn.cursor()
    print("Cursor opened for processing csv ",csv_file)
    
    with open(csv_file,'r') as i:
        # Skip the header row
        next(i)
        
        # copy the table
        cur.copy_from(i,table_na,sep=',',null='')
    

In [30]:
conn = psycopg2.connect(user = "postgres", password = password, database = "dass")
print("Connection established")

load_table_func(conn,'age_grp.csv','age_grp')
load_table_func(conn,'person.csv','person_info')
load_table_func(conn,'loc_info.csv','location_info')
load_table_func(conn,'dass_info.csv','dass_info')
conn.commit()
conn.close()

Connection established
Cursor opened for processing csv  age_grp.csv
Cursor opened for processing csv  person.csv
Cursor opened for processing csv  loc_info.csv
Cursor opened for processing csv  dass_info.csv
