In [1]:
import sys
import os
import findspark
# Initialise findspark before any pyspark import in the cells below.
# NOTE(review): presumably this locates the local Spark install and puts its
# pyspark on sys.path — confirm against the findspark documentation.
findspark.init()

# Point both PySpark interpreter settings at the Python executable that is
# running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)




In [2]:
import spark
import pyspark


In [3]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import *



In [4]:
# Obtain the SparkSession used by every later cell, under the application
# name 'capstone_project'.
# Note: this rebinds the name `spark`, replacing the module brought in by the
# earlier `import spark` statement.
spark = (
    SparkSession.builder
    .appName('capstone_project')
    .getOrCreate()
)


In [5]:
# Read the database username and password from files/secret.txt.
#
# Expected file format: one `key=value` pair per line; the keys `user` and
# `password` are required. If a key appears more than once, the last
# occurrence wins. Lines without an "=" are ignored.
#
# Defines: `user`, `password` (both str, surrounding whitespace removed).
# Raises:  ValueError if either required key is absent from the file.
secrets_file = os.path.join("files", "secret.txt")

credentials = {}
with open(secrets_file, "r") as f:
    for line in f:
        # partition() splits on the FIRST "=" only, so a value that itself
        # contains "=" is kept intact instead of being truncated.
        key, sep, value = line.partition("=")
        if sep:
            # Strip the key as well, so "user = name" is accepted.
            credentials[key.strip()] = value.strip()

# Fail here with a clear message instead of a NameError at the JDBC write.
missing = [key for key in ("user", "password") if key not in credentials]
if missing:
    raise ValueError(
        "{} is missing required key(s): {}".format(secrets_file, ", ".join(missing))
    )

user = credentials["user"]
password = credentials["password"]

In [6]:
# Build the credit-card DataFrame from the JSON source file under files/.
cdw_sapp_credit = os.path.join("files", "cdw_sapp_credit.json")

df_creditcard = spark.read.format("json").load(cdw_sapp_credit)



In [7]:

# Preview the raw credit-card DataFrame.
# (The former `type(df_creditcard)` line was removed: it was not the last
# expression in the cell, so its value was computed and discarded.)
df_creditcard.show()

+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
|BRANCH_CODE|  CREDIT_CARD_NO| CUST_SSN|DAY|MONTH|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|YEAR|
+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
|        114|4210653349028689|123459988| 14|    2|             1|       Education|             78.9|2018|
|         35|4210653349028689|123459988| 20|    3|             2|   Entertainment|            14.24|2018|
|        160|4210653349028689|123459988|  8|    7|             3|         Grocery|             56.7|2018|
|        114|4210653349028689|123459988| 19|    4|             4|   Entertainment|            59.73|2018|
|         93|4210653349028689|123459988| 10|   10|             5|             Gas|             3.59|2018|
|        164|4210653349028689|123459988| 28|    5|             6|       Education|             6.89|2018|
|        119|4210653349028689|123459988| 19|  

In [8]:
# Print the schema of the DataFrame read from the JSON file to inspect its column types.
df_creditcard.printSchema()

root
 |-- BRANCH_CODE: long (nullable = true)
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- CUST_SSN: long (nullable = true)
 |-- DAY: long (nullable = true)
 |-- MONTH: long (nullable = true)
 |-- TRANSACTION_ID: long (nullable = true)
 |-- TRANSACTION_TYPE: string (nullable = true)
 |-- TRANSACTION_VALUE: double (nullable = true)
 |-- YEAR: long (nullable = true)



In [9]:
    
# Register the raw DataFrame as the temp view `credit_data` so it can be queried through spark.sql.
df_creditcard.createOrReplaceTempView('credit_data')

In [10]:
# Reshape the raw transactions from the `credit_data` view:
#   * CREDIT_CARD_NO is renamed to CUST_CC_NO.
#   * YEAR, MONTH and DAY are merged into a single string column TIMEID
#     (YYYYMMDD); month and day are left-padded with '0' to two characters.
#   * The remaining columns are passed through unchanged.
# The numeric date parts are cast to STRING explicitly and the pad is given as
# the string '0', so the query does not rely on implicit type coercion.
creditcard_df = spark.sql("""
    SELECT CREDIT_CARD_NO AS CUST_CC_NO,
           CONCAT(CAST(YEAR AS STRING),
                  LPAD(CAST(MONTH AS STRING), 2, '0'),
                  LPAD(CAST(DAY AS STRING), 2, '0')) AS TIMEID,
           CUST_SSN,
           BRANCH_CODE,
           TRANSACTION_TYPE,
           TRANSACTION_VALUE,
           TRANSACTION_ID
    FROM credit_data
""")

creditcard_df.show()


+----------------+--------+---------+-----------+----------------+-----------------+--------------+
|      CUST_CC_NO|  TIMEID| CUST_SSN|BRANCH_CODE|TRANSACTION_TYPE|TRANSACTION_VALUE|TRANSACTION_ID|
+----------------+--------+---------+-----------+----------------+-----------------+--------------+
|4210653349028689|20180214|123459988|        114|       Education|             78.9|             1|
|4210653349028689|20180320|123459988|         35|   Entertainment|            14.24|             2|
|4210653349028689|20180708|123459988|        160|         Grocery|             56.7|             3|
|4210653349028689|20180419|123459988|        114|   Entertainment|            59.73|             4|
|4210653349028689|20181010|123459988|         93|             Gas|             3.59|             5|
|4210653349028689|20180528|123459988|        164|       Education|             6.89|             6|
|4210653349028689|20180519|123459988|        119|   Entertainment|            43.39|             7|


In [11]:
# Register the reshaped DataFrame as the temp view `credit_df` for the follow-up SQL query.
creditcard_df.createOrReplaceTempView('credit_df')

In [12]:
# Print the schema of creditcard_df to inspect its column types.
creditcard_df.printSchema()

root
 |-- CUST_CC_NO: string (nullable = true)
 |-- TIMEID: string (nullable = true)
 |-- CUST_SSN: long (nullable = true)
 |-- BRANCH_CODE: long (nullable = true)
 |-- TRANSACTION_TYPE: string (nullable = true)
 |-- TRANSACTION_VALUE: double (nullable = true)
 |-- TRANSACTION_ID: long (nullable = true)



In [13]:
# Re-select from the `credit_df` view, casting CUST_SSN, BRANCH_CODE and
# TRANSACTION_ID to INT and TRANSACTION_VALUE to DOUBLE; CUST_CC_NO, TIMEID
# and TRANSACTION_TYPE are selected as-is.
creditcard_df = spark.sql(
    "SELECT CUST_CC_NO, TIMEID, "
    "CAST(CUST_SSN AS INT) CUST_SSN, "
    "CAST(BRANCH_CODE AS INT) BRANCH_CODE, "
    "TRANSACTION_TYPE, "
    "CAST(TRANSACTION_VALUE AS DOUBLE) TRANSACTION_VALUE, "
    "CAST(TRANSACTION_ID AS INT) TRANSACTION_ID "
    "FROM credit_df"
)

In [14]:
# Print the schema again to check the column types produced by the CAST query.
creditcard_df.printSchema()

root
 |-- CUST_CC_NO: string (nullable = true)
 |-- TIMEID: string (nullable = true)
 |-- CUST_SSN: integer (nullable = true)
 |-- BRANCH_CODE: integer (nullable = true)
 |-- TRANSACTION_TYPE: string (nullable = true)
 |-- TRANSACTION_VALUE: double (nullable = true)
 |-- TRANSACTION_ID: integer (nullable = true)



In [15]:
# Display rows of the cast DataFrame before it is written to the database.
creditcard_df.show()

+----------------+--------+---------+-----------+----------------+-----------------+--------------+
|      CUST_CC_NO|  TIMEID| CUST_SSN|BRANCH_CODE|TRANSACTION_TYPE|TRANSACTION_VALUE|TRANSACTION_ID|
+----------------+--------+---------+-----------+----------------+-----------------+--------------+
|4210653349028689|20180214|123459988|        114|       Education|             78.9|             1|
|4210653349028689|20180320|123459988|         35|   Entertainment|            14.24|             2|
|4210653349028689|20180708|123459988|        160|         Grocery|             56.7|             3|
|4210653349028689|20180419|123459988|        114|   Entertainment|            59.73|             4|
|4210653349028689|20181010|123459988|         93|             Gas|             3.59|             5|
|4210653349028689|20180528|123459988|        164|       Education|             6.89|             6|
|4210653349028689|20180519|123459988|        119|   Entertainment|            43.39|             7|


In [16]:

# Write creditcard_df to the MySQL table creditcard_capstone.cdw_sapp_credit_card
# over JDBC, authenticating with the `user` / `password` values read from the
# secrets file. The save mode is "append".
(
    creditcard_df.write
    .format("jdbc")
    .mode("append")
    .options(
        url="jdbc:mysql://localhost:3306/creditcard_capstone",
        dbtable="creditcard_capstone.cdw_sapp_credit_card",
        user=user,
        password=password,
    )
    .save()
)


In [17]:
# Stop the Spark session now that the DataFrame has been written out.
spark.stop()

In [18]:
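# NOTE(review): reference SQL, kept commented out — this notebook does not execute it.
# It appears intended to be run manually against MySQL after the load above, so that
# the string columns of the written table become VARCHAR(45) — confirm before relying on it.
# ALTER TABLE `creditcard_capstone`.`cdw_sapp_credit_card` 
# CHANGE COLUMN `CUST_CC_NO` `CUST_CC_NO` VARCHAR(45) NULL DEFAULT NULL ,
# CHANGE COLUMN `TIMEID` `TIMEID` VARCHAR(45) NULL DEFAULT NULL ,
# CHANGE COLUMN `TRANSACTION_TYPE` `TRANSACTION_TYPE` VARCHAR(45) NULL DEFAULT NULL ;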

