# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [None]:
%idle_timeout 2880
%glue_version 5.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the active SparkContext if one already exists, otherwise create it.
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext and provides the Glue-specific readers
# used below (e.g. create_dynamic_frame.from_catalog).
glueContext = GlueContext(sc)
# SparkSession exposed by the GlueContext.
spark = glueContext.spark_session
# Glue Job handle bound to this context; it is not referenced again in the
# cells shown in this notebook.
job = Job(glueContext)

#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [None]:
# Read the raw table registered in the Glue Data Catalog into a DynamicFrame.
dyf = glueContext.create_dynamic_frame.from_catalog(
    database="etl-project-for-database",
    table_name="raw_data2",
)

# Print the schema of the loaded DynamicFrame so the column names and types
# can be checked before any transformation.
dyf.printSchema()

#### Convert the DynamicFrame to a Spark DataFrame and display a sample of the data


In [None]:
# Convert the Glue DynamicFrame into a Spark DataFrame; the following cells
# use the DataFrame API on `df`.
df = dyf.toDF()
# Print a sample of rows for a quick visual check of the data.
df.show()

#### Keep only the columns we need and drop the rest


In [None]:
# Columns kept for the rest of the pipeline; every other column is dropped.
columns_to_keep = [
    "id",
    "year_birth",
    "education",
    "marital_status",
    "income",
    "dt_customer",
]

# Project the DataFrame down to the columns listed above.
df = df.select(*columns_to_keep)

# Preview the first 10 rows of the reduced DataFrame.
df.show(10)

#### Count null (missing) values in each column


In [None]:
# Import only the functions this notebook uses. A wildcard import of
# pyspark.sql.functions would shadow Python built-ins such as sum, min, max,
# abs and round in every cell that runs afterwards.
# `mean` is not used in this cell; it is imported here because the mean
# imputation cell further down relies on it being in scope.
from pyspark.sql.functions import col, count, mean, when

# Build one aggregate per column: when() yields a non-null value only for rows
# where the column is null, and count() counts non-null values, so each output
# column holds the number of nulls in the input column of the same name.
# Note: isNull() detects SQL NULLs only, not floating-point NaN values.
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

There are 24 null (missing) values in the "income" column. Let's fill them with the column mean.

In [None]:
# Average of the "income" column, pulled back to the driver as a plain value.
mean_value = df.agg(mean("income")).first()[0]

# Replace the missing "income" entries with that average; other columns are
# left untouched because of the subset restriction.
df = df.na.fill(mean_value, subset=["income"])

# Re-run the per-column null count to confirm "income" has no gaps left.
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(*null_counts).show()

#### Write the data as CSV to the "transformed_data" folder in our S3 bucket.

In [None]:
# Persist the cleaned DataFrame to S3 as CSV files with a header row.
# The write uses append mode, so running this cell again adds another set of
# files to the same prefix instead of replacing the previous output.
(
    df.write
    .mode("append")
    .format("csv")
    .option("header", "true")
    .save("s3://etl-bucket-s3/etl-bucket-s3-database/transformed_data/")
)

#### Write the data as JSON to the "transformed_data" folder in our S3 bucket.

In [None]:
# Persist the cleaned DataFrame to S3 as JSON files.
# The write uses append mode, so running this cell again adds another set of
# files to the same prefix instead of replacing the previous output.
# NOTE(review): this destination bucket differs from the one used by the CSV
# write cell in this notebook; confirm that is intended.
(
    df.write
    .mode("append")
    .format("json")
    .save("s3://etl-project-for-medium/etl-project-for-medium-database/transformed_data/")
)