# Overview
In this notebook we will Extract data from a CSV, Transform the data, and then Load the data into SQL Server (ETL), all using Python.

Our workflow will consist of the following steps:
1. Extract the data from csv
2. Transform the data
3. Load the data into SQL


# Step 1 - 2: Extract and Transform the CSV data
This code is copied from the notebook on Data Preparation using Pandas.

In [15]:
# Import the pandas library
import pandas

In [16]:
# Define a conversion function to (T) Transform our data
def convert_percent_string_to_float(raw_string):
    """Convert a percentage written as text (e.g. " 2.3%") into a fraction (0.023).

    Parameters
    ----------
    raw_string : str or bytes
        The raw field value. Surrounding whitespace and leading/trailing
        "%" characters are removed before parsing. Bytes input is decoded
        to text first.

    Returns
    -------
    float
        The parsed number divided by 100.

    Raises
    ------
    ValueError
        If the text that remains after stripping is not a valid number.
    """
    # The input may arrive as a byte array; decode it so the string
    # operations below work on text in either case
    if isinstance(raw_string, (bytes, bytearray)):
        raw_string = raw_string.decode()

    # Remove surrounding whitespace first, then the percent sign
    raw_string = raw_string.strip()
    raw_string = raw_string.strip("%")

    # Make it a float
    input_float = float(raw_string)

    # We move the decimal place
    result = input_float / 100

    return result

In [17]:
# Import a module to help us import data
# This module implements a file-like class, StringIO, that reads and writes a string buffer
import io

# Setup some other parameters to instruct the pandas function how and what we are importing
column_names = ("dates", "ints", "percents", "numbers")
test_data_string = "2019-04-08, 1, 2.3%, 45.\n2019-04-08, 6, 78.9%, 0"
delimiter = ","

# "percents" is parsed by our own converter, so it must not also get a dtype
converter_mapping = {
    "percents": convert_percent_string_to_float
}

# Per-column dtypes are given as a {column name: dtype} mapping.
# "dates" is handled by parse_dates and "percents" by the converter above,
# so only the plain numeric columns are listed here.
column_types = {
    "ints": "int64",
    "numbers": "float64",
}

# Create a file handle for our string data
test_data_file_handle = io.StringIO(test_data_string)

# skipinitialspace drops the blank that follows each comma in the test data,
# so the numeric columns are parsed from clean text
my_dataframe = pandas.read_csv(
    test_data_file_handle,
    sep=delimiter,
    names=column_names,
    dtype=column_types,
    parse_dates=["dates"],
    converters=converter_mapping,
    skipinitialspace=True,
)

my_dataframe



Unnamed: 0,dates,ints,percents,numbers
0,2019-04-08,1,0.023,45.0
1,2019-04-08,6,0.789,0.0


# Step 3: Load the data into SQL Server
We will use an ODBC driver and its Python wrapper, pyodbc, to connect our Jupyter notebook to the Microsoft SQL Server instance. Both need to be installed ahead of time.

In [2]:
# Import the pyodbc driver (this allows python to talk to SQL Server)
import pyodbc

In [3]:
# Show a list of available drivers that can be used with pyodbc.
# The driver named in the connection parameters should be one of the entries listed here.
pyodbc.drivers()

['SQL Server',
 'SQL Server Native Client 11.0',
 'ODBC Driver 17 for SQL Server']

In [4]:
# Set some parameters to make our connection to the sql database.
# These three values are passed to pyodbc.connect() as the server, database
# and driver keywords. The driver name is wrapped in braces.
sql_server_name = 'my-server'
sql_database_name = 'Training'
sql_driver_specification = '{ODBC Driver 17 for SQL Server}' # Note: The SQL Server driver will not work for this db

In [5]:
# Connect to the database.
# No user name or password is supplied; trusted_connection='yes' is handed to
# the driver instead (presumably selecting Windows integrated authentication -
# confirm for your environment).
sql_connection = pyodbc.connect(driver = sql_driver_specification,
                    server = sql_server_name,
                    database = sql_database_name,
                    trusted_connection='yes')
# Display the connection object so we can see that the connect call returned one
sql_connection

<pyodbc.Connection at 0x259e0c37c60>

In [43]:
# Create a table in our database which will hold our data.
# table_name is also used by the insert cell further down.
table_name = 'taylor_training'
# The "if not exists" guard means CREATE TABLE only runs when sysobjects has no
# entry with this name and xtype='U', so the cell can be re-run without error.
# The column list mirrors the four dataframe columns: dates, ints, percents, numbers.
sql_query = """
if not exists (select * from sysobjects where name='{0}' and xtype='U')
    CREATE TABLE {1} (
        dates date,
        ints int,
        percents float,
        numbers float
    )
"""
# Both placeholders ({0} in the existence check, {1} in CREATE TABLE) receive the same table name
sql_query = sql_query.format(table_name, table_name)
# Print the final statement so the exact SQL sent to the server is visible in the output
print(sql_query)
# This cursor is kept at notebook level and reused by the insert cell
cursor = sql_connection.cursor()
cursor.execute(sql_query)
sql_connection.commit()
# Printed whether the table was just created or was already there
print("Table Exists!")


if not exists (select * from sysobjects where name='taylor_training' and xtype='U')
    CREATE TABLE taylor_training (
        dates date,
        ints int,
        percents float,
        numbers float
    )

Table Exists!


In [52]:
# Write a template for a query we can reuse.
# Only the table name is formatted into the text (an identifier cannot be sent
# as a parameter). The row values are sent separately through the "?" parameter
# markers, so the driver handles quoting and type conversion for us.
sql_insert_query_template = """
INSERT INTO {0} (dates, ints, percents, numbers)
VALUES (?, ?, ?, ?);
"""
sql_insert_query = sql_insert_query_template.format(table_name)

# Build one parameter tuple per dataframe row, converted to plain Python types
# (date, int, float) so the driver does not have to understand pandas/numpy scalars
insert_parameters = [
    (
        pandas.Timestamp(row.dates).date(),
        int(row.ints),
        float(row.percents),
        float(row.numbers),
    )
    for row in my_dataframe.itertuples(index=False)
]

# Send all rows and commit once. executemany rejects an empty parameter list,
# so skip the call when the dataframe has no rows.
# Note: this is a plain INSERT, so re-running the cell appends the same rows again.
if insert_parameters:
    cursor.executemany(sql_insert_query, insert_parameters)
    sql_connection.commit()

In [53]:
# Do a sanity check and look at what we have in our sql table.
# The notebook imports the library as "pandas" (there is no "pd" alias), and
# read_sql_query already returns a DataFrame, so no extra DataFrame() wrap is needed.
results_dataframe = pandas.read_sql_query("select * from {0}".format(table_name), sql_connection)

# Alias kept in case anything else still refers to the original variable name
SQL_Query = results_dataframe

results_dataframe

Unnamed: 0,dates,ints,percents,numbers
0,2019-04-08,1,0.023,45.0
1,2019-04-08,1,0.023,45.0
2,2019-04-08,1,0.023,45.0
3,2019-04-08,1,0.023,45.0
4,2019-04-08,6,0.789,0.0
5,2019-04-08,1,0.023,45.0
6,2019-04-08,6,0.789,0.0
7,2019-04-08,1,0.023,45.0
8,2019-04-08,6,0.789,0.0
