In [1]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
import os
import tensorflow as tf

import findspark
findspark.init()

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Import packages
from pyspark.sql import SparkSession
import time

# Build (or reuse) the SparkSession that backs the SQL queries in this notebook
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .config("spark.sql.debug.maxToStringFields", 2000)
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Use 8 shuffle partitions for Spark SQL operations
spark.conf.set("spark.sql.shuffle.partitions", 8)

In [33]:
# Load the Ghibli characters CSV into a pandas DataFrame and preview the first rows
characters_csv = "Resources/Ghibli_characters.csv"
df = pd.read_csv(characters_csv)
df.head()

Unnamed: 0,character name,age,height (cm),special powers,country / place of residence,gender,Species,movie,release date
0,Totoro,1302.0,215.0,"Flight, teleportation",Japan,Male,Spirit,My Neighbor Totoro,1988
1,Chihiro,10.0,120.0,,Japan,Female,Human,Spirited Away,2001
2,Haku,12.0,180.0,"Flight, shapeshifting",Japan,Male,Spirit (dragon),Spirited Away,2001
3,Princess Mononoke,15.0,150.0,,Japan,Female,Human,Princess Mononoke,1997
4,Ashitaka,17.0,170.0,"Super strength, agility",Japan,Male,Human,Princess Mononoke,1997


In [4]:
# Register the CSV with the SparkContext so it can later be located via SparkFiles
from pyspark import SparkFiles
file = "Resources/Ghibli_characters.csv"  # relative path to the source CSV
spark.sparkContext.addFile(file)

In [5]:
# Look up the path of the CSV that was registered with addFile
characters_csv_path = SparkFiles.get("Ghibli_characters.csv")

# Read the CSV (first line is the header) into a Spark DataFrame
SG_characters = spark.read.csv(characters_csv_path, sep=",", header=True)

# Expose the DataFrame to Spark SQL under the name SG_characters
SG_characters.createOrReplaceTempView("SG_characters")

In [6]:
# Preview the SG_characters DataFrame (show() prints the first 20 rows by default)
SG_characters.show()

+--------------------+----+-----------+--------------------+----------------------------+------+--------------------+--------------------+------------+
|      character name| age|height (cm)|      special powers|country / place of residence|gender|             Species|               movie|release date|
+--------------------+----+-----------+--------------------+----------------------------+------+--------------------+--------------------+------------+
|              Totoro|1302|        215|Flight, teleporta...|                       Japan|  Male|              Spirit|  My Neighbor Totoro|        1988|
|             Chihiro|  10|        120|                 N/A|                       Japan|Female|               Human|       Spirited Away|        2001|
|                Haku|  12|        180|Flight, shapeshif...|                       Japan|  Male|     Spirit (dragon)|       Spirited Away|        2001|
|   Princess Mononoke|  15|        150|                 N/A|                       Japan

In [7]:
# Count the characters per species and time how long the query takes.
# perf_counter() is the clock intended for measuring short elapsed durations.
start_time = time.perf_counter()

# GROUP BY already yields exactly one row per species, so DISTINCT is not needed.
# count(Species) counts only the non-null Species values in each group.
species_df = spark.sql("""
select Species, count(Species)
from SG_characters
group by Species
""")

species_df.show()

print("--- %s seconds ---" % (time.perf_counter() - start_time))

+--------------------+--------------+
|             Species|count(Species)|
+--------------------+--------------+
|        Wolf goddess|             1|
|           Human cat|             1|
|            Boar god|             1|
|              Spirit|             2|
|               Witch|             1|
|                Fish|             1|
|     Spirit (dragon)|             1|
|                 Cat|             2|
|              Wizard|             1|
|Mermaid-like crea...|             1|
|           Scarecrow|             1|
|       Moon princess|             1|
|       Forest spirit|             1|
|               Human|            40|
|          Fire demon|             1|
+--------------------+--------------+

--- 0.7102868556976318 seconds ---


In [8]:
# List the distinct movies in the dataset; the number of movies can be read off
# the output (note that show() prints only the first 20 rows by default)
spark.sql("""
select distinct movie
from SG_characters
""").show()

+--------------------+
|               movie|
+--------------------+
|Nausicaä of the V...|
|               Ponyo|
|       Spirited Away|
|Whisper of the Heart|
|Grave of the Fire...|
|   Princess Mononoke|
|  My Neighbor Totoro|
|When Marnie Was T...|
|The Tale of the P...|
|Kiki's Delivery S...|
|   Castle in the Sky|
|Howl's Moving Castle|
|     The Cat Returns|
|         Porco Rosso|
+--------------------+



In [9]:
# Drop the non-beneficial identifier/metadata columns and preview the result
non_beneficial_columns = ["character name", "movie", "release date"]
cleanedCharacters = df.drop(columns=non_beneficial_columns)
cleanedCharacters.head()

Unnamed: 0,age,height (cm),special powers,country / place of residence,gender,Species
0,1302.0,215.0,"Flight, teleportation",Japan,Male,Spirit
1,10.0,120.0,,Japan,Female,Human
2,12.0,180.0,"Flight, shapeshifting",Japan,Male,Spirit (dragon)
3,15.0,150.0,,Japan,Female,Human
4,17.0,170.0,"Super strength, agility",Japan,Male,Human


In [10]:
# Standardize the numeric age and height columns
numeric_columns = cleanedCharacters[["age", "height (cm)"]]
data_scaled = StandardScaler().fit_transform(numeric_columns)

In [11]:
# Wrap the scaled array in a DataFrame, restoring the original column labels
scaled_column_names = ["age", "height (cm)"]
df_data_scaled = pd.DataFrame(data_scaled, columns=scaled_column_names)

# Preview the first rows of the scaled data
df_data_scaled.head()

Unnamed: 0,age,height (cm)
0,5.884276,2.080366
1,-0.269684,-0.396138
2,-0.260157,1.167969
3,-0.245868,0.385915
4,-0.236342,0.907285


In [12]:
# One-hot encode the "special powers" column with get_dummies: one boolean
# column per distinct power string. Rows with a missing value get False in
# every column (see row 1 in the preview).
ghibli_dummies = pd.get_dummies(cleanedCharacters["special powers"])
ghibli_dummies.head()

Unnamed: 0,Fire manipulation,"Flight, broomstick flying","Flight, shapeshifting","Flight, teleportation","Shape-shifting, size manipulation",Shapeshifting,Shapeshifting into a cat,Shapeshifting into a fish,"Shapeshifting, flight, magic","Super strength, agility","Super strength, agility, communication with wolves","Super strength, agility, shapeshifting into a boar","Super strength, agility, telekinesis","Telekinesis, control over levitation crystals","Telepathy, communication with insects"
0,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False


In [13]:
# Print the encoded frame as plain text to inspect it beyond the head() preview
print(ghibli_dummies)

    Fire manipulation  Flight, broomstick flying  Flight, shapeshifting  \
0               False                      False                  False   
1               False                      False                  False   
2               False                      False                   True   
3               False                      False                  False   
4               False                      False                  False   
5               False                      False                  False   
6               False                      False                  False   
7               False                      False                  False   
8               False                       True                  False   
9               False                      False                  False   
10              False                      False                  False   
11               True                      False                  False   
12              False    

In [14]:
# Grab the Species column and preview its first rows before binning
species_class = df['Species']
species_class.head()

0             Spirit
1              Human
2    Spirit (dragon)
3              Human
4              Human
Name: Species, dtype: object

In [15]:
# Select the entries whose species is exactly "Human"
Humans = species_class[species_class == "Human"]

# Bin every "Human" entry as "1". A single replace call covers the whole
# column, so there is no need to repeat it once per human row.
df['Species'] = df['Species'].replace("Human", "1")

# Check to make sure binning was successful
df['Species']

0                    Spirit
1                         1
2           Spirit (dragon)
3                         1
4                         1
5                         1
6                         1
7                       Cat
8                     Witch
9                    Wizard
10                        1
11               Fire demon
12                        1
13    Mermaid-like creature
14             Wolf goddess
15                        1
16                        1
17                        1
18                        1
19                Human cat
20                        1
21                      Cat
22                     Fish
23                        1
24                        1
25                        1
26            Moon princess
27                 Boar god
28                        1
29                        1
30                        1
31                        1
32                        1
33                   Spirit
34                        1
35                  

In [16]:
# Select every entry whose original species is anything other than "Human"
OtherSpecies = species_class[species_class != "Human"]

# Bin all non-human species as "0" in one vectorized step: mask() overwrites
# the rows where the condition is True and leaves the other rows unchanged.
df['Species'] = df['Species'].mask(species_class != "Human", "0")

# Check to make sure binning was successful
df['Species']

0     0
1     1
2     0
3     1
4     1
5     1
6     1
7     0
8     0
9     0
10    1
11    0
12    1
13    0
14    0
15    1
16    1
17    1
18    1
19    0
20    1
21    0
22    0
23    1
24    1
25    1
26    0
27    0
28    1
29    1
30    1
31    1
32    1
33    0
34    1
35    1
36    1
37    1
38    1
39    0
40    1
41    0
42    1
43    1
44    1
45    1
46    1
47    1
48    1
49    1
50    1
51    1
52    1
53    1
54    1
55    1
Name: Species, dtype: object

In [17]:
# Build the model inputs: feature matrix X (encoded special powers) and
# target vector y (binned Species cast to float)
X = ghibli_dummies.to_numpy()
y = df['Species'].astype('float64')

In [28]:
# Preview the first five rows of X
X[:5]

array([[False, False, False,  True, False, False, False, False, False,
        False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False],
       [False, False,  True, False, False, False, False, False, False,
        False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False, False,
         True, False, False, False, False, False]])

In [26]:
# Preview the first ten values of y
y[:10]

0    0.0
1    1.0
2    0.0
3    1.0
4    1.0
5    1.0
6    1.0
7    0.0
8    0.0
9    0.0
Name: Species, dtype: float64

In [20]:
# Split the data into training and testing sets (fixed random_state so the
# split is repeatable), then report the shapes of the training portion
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)
print("Training Data Info")
print("Training Data Shape:", X_train.shape)
print("Training Data Labels Shape:", y_train.shape)

Training Data Info
Training Data Shape: (42, 15)
Training Data Labels Shape: (42,)


In [21]:
# Build the scaler and fit it on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted transformation to both the training and the testing split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [29]:
# Define the model - a deep neural net with three hidden layers and a single
# sigmoid output unit for the binary target.
# The input width is taken from the training data instead of being hard-coded,
# so the model still builds if the number of encoded feature columns changes.
number_input_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input dimension)
nn.add(tf.keras.layers.Dense(units=10, activation='relu', input_dim=number_input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=7, activation='relu'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=13, activation='tanh'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 10)                160       
                                                                 
 dense_5 (Dense)             (None, 7)                 77        
                                                                 
 dense_6 (Dense)             (None, 13)                104       
                                                                 
 dense_7 (Dense)             (None, 1)                 14        
                                                                 
Total params: 355 (1.39 KB)
Trainable params: 355 (1.39 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [30]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [31]:
# Train the model on the scaled training data for 100 epochs
# (fit_model keeps the object returned by fit)
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [32]:
# Score the trained model on the held-out test split and report both metrics
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1/1 - 0s - loss: 0.5357 - accuracy: 0.7857 - 115ms/epoch - 115ms/step
Loss: 0.5357179641723633, Accuracy: 0.7857142686843872
