# Data Loader

Script to read in CSV files and export to MySQL.

In [1]:
#Not all these dependencies are needed to load the data, but these are good for notebook analysis

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sqlalchemy import create_engine

# Use the fully qualified option name: the bare 'max_columns' pattern is ambiguous in
# newer pandas (it also matches 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', 500)

In [2]:
def importer(path):
    '''Load the CSV found at ``path`` with pandas and hand back the resulting DataFrame.'''
    return pd.read_csv(path)

# Location of the campaign CSV, relative to the notebook's working directory (two levels up)
path = '../../stacyabramscampaigndec2018.csv'

# Load the raw file; `campaign` is the frame the cells below work from
campaign = importer(path)

In [3]:
# Preview the first rows to sanity-check the columns that were read
campaign.head()

Unnamed: 0,FilerID,Type,LastName,FirstName,Address,City,State,Zip,PAC,Occupation,Employer,Date,Election,Election_Year,Cash_Amount,In_Kind_Amount,In_Kind_Description,Candidate_FirstName,Candidate_MiddleName,Candidate_LastName,Candidate_Suffix,Committee_Name
0,C2017000285,Monetary,Barlow,Allison,105 Berkeley Pl,Brooklyn,NY,11217-3786,,Program Director,Wallace Global Fund,10/31/2018 12:00:00 AM,General,2018,100.0,0.0,,Stacey,Yvonne,Abrams,,Stacy Abrams for Governor
1,C2017000285,Monetary,Barlow,Cathy,6130 Ardleigh St,Philadelphia,PA,19138-1520,,Attorney,FOCG LLC,11/8/2018 12:00:00 AM,General,2018,50.0,0.0,,Stacey,Yvonne,Abrams,,Stacy Abrams for Governor
2,C2017000285,Monetary,Barlow,Jesse,1427 S Pugh St,State College,PA,16801-6132,,Professor,Penn State,11/1/2018 12:00:00 AM,General,2018,50.0,0.0,,Stacey,Yvonne,Abrams,,Stacy Abrams for Governor
3,C2017000285,Monetary,Barnett,Elizabeth B.,1123 Narcisco St NE,Albuquerque,NM,87112-6656,,Not employed,noneNone,11/2/2018 12:00:00 AM,General,2018,3.0,0.0,,Stacey,Yvonne,Abrams,,Stacy Abrams for Governor
4,C2017000285,Monetary,Barrett,Nolen,346 29th Ave,San Francisco,CA,94121-1703,,Not Employed,Not Employed,10/27/2018 12:00:00 AM,General,2018,125.0,0.0,,Stacey,Yvonne,Abrams,,Stacy Abrams for Governor


In [5]:
def id_maker(df, col_name, identity_fields, base_number=1000000000):
    '''Assign a numeric id to every distinct combination of identity fields.

    Parameters
    ----------
    df : DataFrame containing at least the columns in ``identity_fields``.
    col_name : name given to the generated id column / returned Series.
    identity_fields : list of column names that together identify one entity.
    base_number : first id handed out; ids count up from here in order of
        first appearance of each distinct combination.

    Returns
    -------
    Series named ``col_name`` with one id per row of ``df``, in the same row
    order and carrying ``df``'s own index, so it can be assigned straight back
    onto ``df`` without index misalignment.
    '''

    # Work on the identity columns only: if df already has a column called
    # col_name, merging the full frame would produce suffixed duplicates
    # (col_name_x / col_name_y) and the final lookup would raise KeyError.
    keys = df[identity_fields]

    # One row per distinct entity, numbered consecutively from base_number
    match_table = keys.drop_duplicates().copy()
    match_table[col_name] = np.arange(base_number, base_number + len(match_table))

    # A left merge keeps the row order of `keys`, but the result gets a fresh 0..n-1 index
    merged = pd.merge(keys,
                      match_table,
                      on=identity_fields,
                      how='left')

    # Re-attach the caller's index so assignment back onto df aligns row for row
    return pd.Series(merged[col_name].values, index=df.index, name=col_name)



#id_check = id_maker(df = campaign, col_name = 'ContributorId', identity_fields = ['LastName', 'FirstName', 'Zip', 'Address'])

In [6]:
def contributor(df, key_col='ContributorId'):
    '''Parse and prep contributor info from the database. Contributor schema:
    
    ContributorId (or whatever name is passed as ``key_col``)
    LastName
    FirstName
    Address1
    Address2
    City
    State
    Zip
    PAC
    Occupation
    Employer

    Parameters
    ----------
    df : DataFrame with at least the columns LastName, FirstName, Address,
        City, State, Zip, PAC, Occupation and Employer. It is not modified.
    key_col : name of the generated id column in the returned table.

    Returns
    -------
    DataFrame with one row per distinct (LastName, FirstName, Address, Zip)
    combination, keeping the first row seen for each, with a fresh 0..n-1 index.
    '''
    
    # Columns that together identify a single contributor
    identity_fields = ['LastName', 'FirstName', 'Address', 'Zip']
    
    records = df.copy() #copy dataframe so as not to impact the data in the original location
    
    # Hand id_maker only the identity columns (so an existing key_col column cannot
    # collide with the generated one) and assign the ids positionally, which keeps
    # the rows lined up even when df does not have a default 0..n-1 index.
    records[key_col] = id_maker(df=records[identity_fields],
                                col_name=key_col,
                                identity_fields=identity_fields).values
    
    #Prepare dataframe to use same fields as in schema
    records['Address1'] = records['Address']
    records['Address2'] = np.nan  # the source data has a single address column
    
    # Use key_col here rather than a hard-coded 'ContributorId' so a custom key name works
    contributor_fields = [key_col, 'LastName', 'FirstName', 'Address1', 'Address2',
                          'City', 'State', 'Zip', 'PAC', 'Occupation', 'Employer']
    
    return (records[contributor_fields]
            .drop_duplicates(subset=[key_col])
            .reset_index(drop=True))


In [7]:
# Build the de-duplicated contributor table from the raw campaign rows
contributors = contributor(campaign)

In [1]:
#Builds the key (connection URL) to connect with MySQL Database using sqlalchemy and pandas.
#The password is read from getin.txt; make sure you preserve the punctuation in each string below.

# Read the password inside a context manager so the file handle is always closed, and
# drop a trailing line break (if any) so it cannot end up in the middle of the URL.
with open('getin.txt') as file:
    pwd = file.read().rstrip('\r\n')

dialect = ''
driver = 'mysql://'
user = 'root:'
host = '@127.0.0.1:'
port = '3306'
database = '/electionmoney'

key = dialect + driver + user + pwd + host + port + database

# Display the URL with the password masked so the secret is not saved in the notebook output
dialect + driver + user + '***' + host + port + database

'mysql://root:***@127.0.0.1:3306/electionmoney'

In [2]:
#Establish the connection using the key
#NOTE: create_engine is already imported in the first cell; this re-import is redundant when the cells run top to bottom

from sqlalchemy import create_engine
engine = create_engine(key)  #build the engine from the URL assembled in the previous cell
connection = engine.connect()  #connection used by the to_sql/read_sql cells below; the recorded output shows this cell raising OperationalError when the MySQL server cannot be reached

OperationalError: (MySQLdb._exceptions.OperationalError) (2003, "Can't connect to MySQL server on '127.0.0.1' (111)")
(Background on this error at: http://sqlalche.me/e/e3q8)

In [None]:
contributors.to_sql('Contributor', connection, if_exists='append', index=False) #write the rows to the Contributor table; if the table already exists the rows are appended to it (not replaced), so re-running this cell inserts them again

In [None]:
pd.read_sql('Contributor', connection) #read the Contributor table back from the SQL database to check what was loaded