In [12]:
import pyspark
import warnings
# Install a blanket "ignore" filter: every Python warning raised after this point
# is suppressed for the rest of the kernel session (presumably to keep cell output
# uncluttered). NOTE(review): this also hides deprecation warnings.
warnings.filterwarnings("ignore")

### PySpark DataFrame

In [13]:
from pyspark.sql import SparkSession

In [16]:
# Obtain the Spark session for this notebook: getOrCreate() reuses an already
# running session if there is one, otherwise it starts a new one named 'tutorial'.
spark = (
    SparkSession.builder
    .appName('tutorial')
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [18]:
# Load the telecom churn CSV into a Spark DataFrame.
# The first line of the file is treated as the header, and column types are
# inferred from the data.
# NOTE(review): the schema printed further down reports TotalCharges as string,
# presumably because some rows hold blank values; cast it before numeric use.
data_path = 'data_telecom_churn2.csv'
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(data_path)
)
# Preview the first 5 rows.
df.show(5)

+----------+------+----------------+--------------------+--------------+------------+-----+
|customerID|gender|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+----------------+--------------------+--------------+------------+-----+
|7590-VHVEG|Female|             Yes|    Electronic check|         29.85|       29.85|   No|
|5575-GNVDE|  Male|              No|        Mailed check|         56.95|      1889.5|   No|
|3668-QPYBK|  Male|             Yes|        Mailed check|         53.85|      108.15|  Yes|
|7795-CFOCW|  Male|              No|Bank transfer (au...|          42.3|     1840.75|   No|
|9237-HQITU|Female|             Yes|    Electronic check|          70.7|      151.65|  Yes|
+----------+------+----------------+--------------------+--------------+------------+-----+
only showing top 5 rows



#### Check the datatypes of all columns

In [19]:
# Print the schema as a tree: one line per column with its type and nullability
df.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: string (nullable = true)
 |-- Churn: string (nullable = true)



In [26]:
# Same type information as the schema, as a list of (column name, type) tuples
df.dtypes

[('customerID', 'string'),
 ('gender', 'string'),
 ('PaperlessBilling', 'string'),
 ('PaymentMethod', 'string'),
 ('MonthlyCharges', 'double'),
 ('TotalCharges', 'string'),
 ('Churn', 'string')]

In [22]:
# Return the first 4 rows as a list of Row objects
df.head(4)

[Row(customerID='7590-VHVEG', gender='Female', PaperlessBilling='Yes', PaymentMethod='Electronic check', MonthlyCharges=29.85, TotalCharges='29.85', Churn='No'),
 Row(customerID='5575-GNVDE', gender='Male', PaperlessBilling='No', PaymentMethod='Mailed check', MonthlyCharges=56.95, TotalCharges='1889.5', Churn='No'),
 Row(customerID='3668-QPYBK', gender='Male', PaperlessBilling='Yes', PaymentMethod='Mailed check', MonthlyCharges=53.85, TotalCharges='108.15', Churn='Yes'),
 Row(customerID='7795-CFOCW', gender='Male', PaperlessBilling='No', PaymentMethod='Bank transfer (automatic)', MonthlyCharges=42.3, TotalCharges='1840.75', Churn='No')]

#### Select columns and indexes

In [25]:
# Project just the customerID and gender columns and preview the first 10 rows
df.select("customerID", "gender").show(10)

+----------+------+
|customerID|gender|
+----------+------+
|7590-VHVEG|Female|
|5575-GNVDE|  Male|
|3668-QPYBK|  Male|
|7795-CFOCW|  Male|
|9237-HQITU|Female|
|9305-CDSKC|Female|
|1452-KIOVK|  Male|
|6713-OKOMC|Female|
|7892-POOKP|Female|
|6388-TABGU|  Male|
+----------+------+
only showing top 10 rows



In [29]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# NOTE(review): TotalCharges is a string column here, so its min/max appear to be
# string comparisons (blank min, max of 999.9 below the mean) — not numeric extremes.
df.describe().show()

+-------+----------+------+----------------+--------------------+------------------+------------------+-----+
|summary|customerID|gender|PaperlessBilling|       PaymentMethod|    MonthlyCharges|      TotalCharges|Churn|
+-------+----------+------+----------------+--------------------+------------------+------------------+-----+
|  count|      7043|  7043|            7043|                7043|              7043|              7043| 7043|
|   mean|      null|  null|            null|                null| 64.76169246059922|2283.3004408418697| null|
| stddev|      null|  null|            null|                null|30.090047097678482| 2266.771361883145| null|
|    min|0002-ORFBO|Female|              No|Bank transfer (au...|             18.25|                  |   No|
|    max|9995-HOTOH|  Male|             Yes|        Mailed check|            118.75|             999.9|  Yes|
+-------+----------+------+----------------+--------------------+------------------+------------------+-----+



#### Drop a column

In [30]:
# Remove the PaymentMethod column; `df` is rebound, so every later cell sees
# the frame without that column.
df = df.drop('PaymentMethod')

In [31]:
# Preview the first 3 rows of the current `df`
df.show(3)

+----------+------+----------------+--------------+------------+-----+
|customerID|gender|PaperlessBilling|MonthlyCharges|TotalCharges|Churn|
+----------+------+----------------+--------------+------------+-----+
|7590-VHVEG|Female|             Yes|         29.85|       29.85|   No|
|5575-GNVDE|  Male|              No|         56.95|      1889.5|   No|
|3668-QPYBK|  Male|             Yes|         53.85|      108.15|  Yes|
+----------+------+----------------+--------------+------------+-----+
only showing top 3 rows



#### Rename the columns

In [33]:
# Preview the frame with the PaperlessBilling column renamed to Paperless.
# The source column is spelled with its exact schema casing: withColumnRenamed is a
# silent no-op when the name does not resolve, which is what a mis-cased name
# would do if spark.sql.caseSensitive were enabled.
# The renamed frame is only displayed here; it is not assigned back to `df`.
df.withColumnRenamed('PaperlessBilling', 'Paperless').show(3)

+----------+------+---------+--------------+------------+-----+
|customerID|gender|Paperless|MonthlyCharges|TotalCharges|Churn|
+----------+------+---------+--------------+------------+-----+
|7590-VHVEG|Female|      Yes|         29.85|       29.85|   No|
|5575-GNVDE|  Male|       No|         56.95|      1889.5|   No|
|3668-QPYBK|  Male|      Yes|         53.85|      108.15|  Yes|
+----------+------+---------+--------------+------------+-----+
only showing top 3 rows

