In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [3]:
# Obtain the Spark session used throughout this notebook, with Hive support
# switched on. getOrCreate() returns the already-running session when one exists.
spark = SparkSession.builder.enableHiveSupport().getOrCreate()

In [4]:
# Load the churn CSV: the first line holds the column names, fields are
# separated by ';', and column types are inferred from the data.
# NOTE(review): in the preview, Balance and EstimatedSalary appear as whole
# numbers -- confirm that matches the source file if fractional amounts were expected.
df_csv = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("s3a://tmp/Churn.csv")
)
df_csv.show(5)

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+

In [5]:
# Project only the Gender and Age columns and preview the first five rows.
df_csv.select('Gender', 'Age').show(5)

+------+---+
|Gender|Age|
+------+---+
|Female| 42|
|Female| 41|
|Female| 42|
|Female| 39|
|Female| 43|
+------+---+
only showing top 5 rows



In [7]:
# List every distinct value that occurs in the Geography column.
df_csv.select('Geography').distinct().show()

+---------+
|Geography|
+---------+
|  Germany|
|   France|
|    Spain|
+---------+



In [13]:
# Python evaluates `&` before `|`, so a row is kept when EITHER
#   - Balance >= 1596608 and Geography is 'France', OR
#   - Gender is 'Male' and Age is 25.
# The extra parentheses below only make that grouping visible.
# NOTE(review): confirm this is the grouping that was intended.
# Matching rows are listed from the highest Balance down.
(
    df_csv
    .filter(
        ((F.col('Balance') >= 1596608) & (F.col('Geography') == 'France'))
        | ((F.col('Gender') == 'Male') & (F.col('Age') == 25))
    )
    .orderBy(F.desc('Balance'))
    .show()
)

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        583|   France|Female| 57|     3|23838756|            1|        0|             1|       14796499|     1|
|        592|   France|  Male| 37|     4|21269297|            1|        0|             0|       17639502|     0|
|        541|   France|  Male| 37|     9|21231403|            1|        0|             1|       14881454|     0|
|        850|   France|Female| 35|     1|21177431|            1|        1|             0|       18857412|     1|
|        596|   France|  Male| 21|     4|21043308|            2|        0|             1|       19729777|     1|
|        479|   France|Female| 33|     2|20816553|            1|        0|             0|       

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain the Spark session for the cells that follow, with Hive support
# switched on. getOrCreate() returns the already-running session when one exists.
spark = SparkSession.builder.enableHiveSupport().getOrCreate()

# Re-read the churn CSV (header row, ';' separator, inferred types) and
# round EstimatedSalary to two decimal places in the same chain.
# NOTE(review): the earlier preview of this file shows EstimatedSalary as
# whole numbers, in which case the rounding changes nothing -- confirm it is needed.
df_csv = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("s3a://tmp/Churn.csv")
    .withColumn('EstimatedSalary', F.round('EstimatedSalary', 2))
)
df_csv.show(5)


In [18]:
# Show the ten rows with the highest derived monthly salary.
# Four source columns are relabelled (Geography -> Cidade, Gender -> Sexo,
# Age -> Idade, Balance -> Saldo) and SalarioMensal is computed directly from
# EstimatedSalary as EstimatedSalary / 12 * 5.6, rounded to two decimals.
# The annual figure itself is not part of the displayed result.
# NOTE(review): 5.6 is an unexplained multiplier (possibly a currency
# conversion rate) -- confirm its meaning and consider naming it.
# NOTE(review): the distinct Geography values shown in this notebook are
# Germany / France / Spain, so the 'Cidade' label may be a misnomer -- confirm.
(
    df_csv
    .select(
        *[
            F.col(source).alias(label)
            for source, label in (
                ("Geography", "Cidade"),
                ("Gender", "Sexo"),
                ("Age", "Idade"),
                ("Balance", "Saldo"),
            )
        ],
        F.round(F.col("EstimatedSalary") / 12 * 5.6, 2).alias("SalarioMensal"),
    )
    .orderBy(F.desc("SalarioMensal"))
    .show(10)
)

+-------+------+-----+--------+-------------+
| Cidade|  Sexo|Idade|   Saldo|SalarioMensal|
+-------+------+-----+--------+-------------+
|  Spain|Female|   42|       0|    9332982.4|
|Germany|Female|   41| 9863577|   9331967.87|
|  Spain|  Male|   27| 1533251|    9331155.4|
| France|  Male|   37|17875584|   9330027.93|
| France|  Male|   28|       0|    9329101.6|
| France|Female|   40|16529812|   9326928.33|
|  Spain|Female|   35| 4016988|   9326681.93|
| France|  Male|   36|15542317|   9325928.27|
|Germany|Female|   34| 9324442|   9324262.73|
| France|Female|   29|       0|    9322864.6|
+-------+------+-----+--------+-------------+
only showing top 10 rows

