In [None]:
import sys
import os
import findspark
# Let findspark set up the environment before any pyspark import is attempted
# in the cells below.
findspark.init()

# Point both the Spark workers and the driver at the interpreter running this
# kernel, so they use the same Python.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [None]:
import spark
import pyspark

In [None]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import *

In [None]:
import numpy as np
import pandas as pd
from getpass import getpass
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Read the database username and password from files/secret.txt.
# Expected format: one "key=value" pair per line, with the keys "user" and "password".
secrets_file = os.path.join("files", "secret.txt")
with open(secrets_file, "r") as file1:
    secret_lines = file1.readlines()
# No explicit close() is needed: leaving the with-block has already closed the file.

credentials = {}
for line in secret_lines:
    # Split on the first "=" only, so a value that itself contains "=" stays intact.
    key, separator, value = line.partition("=")
    if not separator:
        continue  # blank line or a line without "=": nothing to parse
    key = key.strip()
    if key in ("user", "password"):
        credentials[key] = value.strip()

# Fail here with a clear message rather than with a NameError in a later cell.
missing_keys = [key for key in ("user", "password") if key not in credentials]
if missing_keys:
    raise ValueError(
        "Missing entries in {0}: {1}".format(secrets_file, ", ".join(missing_keys))
    )

user = credentials["user"]
password = credentials["password"]

In [None]:
# Get the active Spark session, or create one named 'req3_data_visual'
spark = (
    SparkSession.builder
    .appName('req3_data_visual')
    .getOrCreate()
)

In [None]:
# Create Spark DataFrames from the database tables.
# This cell loads the cdw_sapp_credit_card table over JDBC.
df_cc = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:mysql://localhost:3306/creditcard_capstone",
        dbtable="creditcard_capstone.cdw_sapp_credit_card",
        user=user,
        password=password,
    )
    .load()
)
# Row count of the loaded table, shown as the cell output
df_cc.count()

In [None]:
# Load the cdw_sapp_branch table over JDBC into a Spark DataFrame
df_branch = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:mysql://localhost:3306/creditcard_capstone",
        dbtable="creditcard_capstone.cdw_sapp_branch",
        user=user,
        password=password,
    )
    .load()
)
# Row count of the loaded table, shown as the cell output
df_branch.count()

In [None]:
# Load the cdw_sapp_customer table over JDBC into a Spark DataFrame
df_customer = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:mysql://localhost:3306/creditcard_capstone",
        dbtable="creditcard_capstone.cdw_sapp_customer",
        user=user,
        password=password,
    )
    .load()
)
# Row count of the loaded table, shown as the cell output
df_customer.count()

In [None]:
# Convert each Spark DataFrame into a pandas DataFrame for analysis and plotting
pd_df_cc, pd_df_branch, pd_df_customer = (
    df_cc.toPandas(),
    df_branch.toPandas(),
    df_customer.toPandas(),
)

In [None]:
# Count missing values in each column of the credit-card frame
pd_df_cc.isna().sum()

In [None]:
# Count missing values in each column of the branch frame
pd_df_branch.isna().sum()

In [None]:
# Count missing values in each column of the customer frame
pd_df_customer.isna().sum()

In [None]:
# Show the first row of the credit-card frame to inspect its columns
pd_df_cc.head(1)

In [None]:
# Count transactions per type once and reuse the result. value_counts() already
# returns the counts in descending order, so the first entry is the most frequent type.
transaction_type_counts = pd_df_cc['TRANSACTION_TYPE'].value_counts()
highest_transaction_type = transaction_type_counts.index[0]
# .iloc gives explicit positional access. Plain [0] on a label-indexed Series
# relies on a positional fallback that is deprecated in pandas 2.x.
highest_transaction_count = transaction_type_counts.iloc[0]
print("Transaction type with highest count is \"{0}\" with a count of {1}".format(highest_transaction_type, highest_transaction_count))

In [None]:
# Number of transactions recorded for each transaction type
pd_df_cc['TRANSACTION_TYPE'].value_counts()

In [None]:
# Histogram of the number of transactions per transaction type
px.histogram(data_frame=pd_df_cc, x='TRANSACTION_TYPE')

In [None]:
# Show the first row of the customer frame to inspect its columns
pd_df_customer.head(1)

In [None]:
# Number of customer rows for each state
pd_df_customer['CUST_STATE'].value_counts()

In [None]:
# Count customers per state once and reuse the result. value_counts() already
# returns the counts in descending order, so the first entry is the state with most customers.
customer_state_counts = pd_df_customer['CUST_STATE'].value_counts()
highest_cust_state = customer_state_counts.index[0]
# .iloc gives explicit positional access. Plain [0] on a label-indexed Series
# relies on a positional fallback that is deprecated in pandas 2.x.
highest_cust_count = customer_state_counts.iloc[0]
print("State type with highest customers is \"{0}\" with a count of {1}".format(highest_cust_state, highest_cust_count))

In [None]:
# Horizontal histogram of the number of customers per state
px.histogram(data_frame=pd_df_customer, y='CUST_STATE')

In [None]:
# Total transaction value per customer, keeping the 20 largest totals
pd_df_cust_tx_val = (
    pd_df_cc
    .groupby(by='CUST_SSN')['TRANSACTION_VALUE']
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)

In [None]:
# Axis options reference: https://plotly.com/python/axes/
# Scatter plot of the top customers by total transaction value. px.bar with the
# same x/y columns was tried as an alternative.
# The x axis is set to 'category' so each CUST_SSN is shown as a discrete label.
fig = px.scatter(
    pd_df_cust_tx_val,
    x='CUST_SSN',
    y='TRANSACTION_VALUE',
).update_xaxes(type='category')
fig.show()


Find and plot the top three months with the largest transaction data (measured here as the total transaction value per month).

In [None]:
# Derive MONTHYEAR by dropping the last two characters of TIMEID
# (presumably the day part of a YYYYMMDD string; confirm the TIMEID format)
pd_df_cc['MONTHYEAR'] = pd_df_cc['TIMEID'].str.slice(stop=-2)

In [None]:
# Total transaction value per MONTHYEAR, keeping the three largest totals
pd_top3 = (
    pd_df_cc
    .groupby(['MONTHYEAR'])['TRANSACTION_VALUE']
    .sum()
    .sort_values(ascending=False)
    .head(3)
    .to_frame()
    .reset_index()
)

In [None]:
# Display the three MONTHYEAR values with the largest total transaction value
pd_top3

In [None]:
# Scatter plot of the top three months by total transaction value. px.bar with
# the same x/y columns was tried as an alternative, as was a fixed y-axis tick
# step of 25000.
# The x axis is set to 'category' so each MONTHYEAR is shown as a discrete label.
fig = px.scatter(
    pd_top3,
    x='MONTHYEAR',
    y='TRANSACTION_VALUE',
).update_xaxes(type='category')
fig.show()

In [None]:
#Need to find plotly version.
#px.bar(pd_top3, x='month', y='TRANSACTION_VALUE')

In [None]:
# Keep only the transactions whose type is 'Healthcare'
pd_df_cc_healthcare = pd_df_cc.loc[pd_df_cc['TRANSACTION_TYPE'].eq('Healthcare')]

In [None]:
# Total healthcare transaction value per branch, largest first
(
    pd_df_cc_healthcare
    .groupby(by='BRANCH_CODE')['TRANSACTION_VALUE']
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Total healthcare transaction value per branch, keeping the 20 largest totals
df_br_hc_tx = (
    pd_df_cc_healthcare
    .groupby(by='BRANCH_CODE')['TRANSACTION_VALUE']
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)

In [None]:
# Display the top branches by total healthcare transaction value
df_br_hc_tx

In [None]:
# Bar chart of the top branches by total healthcare transaction value.
# The x axis is set to 'category' so each BRANCH_CODE is shown as a discrete label.
fig = px.bar(
    df_br_hc_tx,
    x='BRANCH_CODE',
    y='TRANSACTION_VALUE',
).update_xaxes(type='category')
fig.show()

In [None]:
# Stop the Spark session now that all data has been loaded and analysed
spark.stop()