<a href="https://colab.research.google.com/github/vvikasreddy/gt-nlp-class/blob/master/Spark2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from pyspark.sql import SparkSession

In [1]:
# Mount Google Drive at /content/drive so that files stored under MyDrive
# (such as the customers CSV read later) are reachable by a regular path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Obtain the Spark entry point for this notebook: reuse the active session
# if one already exists, otherwise start a new one named "Customers".
spark = (
    SparkSession.builder
    .appName("Customers")
    .getOrCreate()
)

In [14]:
# Load the customers CSV from the mounted Drive using the option-chaining
# reader API: the first line supplies the column names, and Spark infers the
# column types from the data instead of reading every field as a string.
df = (
    spark.read
    .option('header', "true")
    .option("inferSchema", "true")
    .csv("/content/drive/MyDrive/data_spark/customers-100.csv")
)

In [15]:
# Preview the loaded rows as a text table; long cell values are cut off
# and shown with a trailing "..." in the output.
df.show()


+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|Index|    Customer Id|First Name|Last Name|             Company|             City|             Country|             Phone 1|             Phone 2|               Email|Subscription Date|             Website|
+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|    1|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     Rasmussen Group|     East Leonard|               Chile|        229.077.5154|    397.884.0519x718|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|
|    2|1Ef7b82A4CAAD10|   Preston|   Lozano|         Vega-Gentry|East Jimmychester|            Djibouti|          5153435776|    686-620-1820x944|     vmata@colon.com|     

In [16]:
# Print each column's name, inferred type and nullability to verify that
# schema inference worked (e.g. Index as integer, Subscription Date as date).
df.printSchema()

root
 |-- Index: integer (nullable = true)
 |-- Customer Id: string (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Phone 1: string (nullable = true)
 |-- Phone 2: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Subscription Date: date (nullable = true)
 |-- Website: string (nullable = true)



In [43]:
# Same load as the option-chaining cell, written with keyword arguments.
# This reads the same file again and rebinds `df` to the fresh DataFrame.
df = spark.read.csv(
    "/content/drive/MyDrive/data_spark/customers-100.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# spark.read.csv(...) returns a Spark DataFrame: a tabular data structure with named, typed columns.

In [44]:
# Fetch the first three rows as a Python list of Row objects (unlike show(),
# which only prints a table); values keep their types, e.g. datetime.date.
df.head(3)

[Row(Index=1, Customer Id='DD37Cf93aecA6Dc', First Name='Sheryl', Last Name='Baxter', Company='Rasmussen Group', City='East Leonard', Country='Chile', Phone 1='229.077.5154', Phone 2='397.884.0519x718', Email='zunigavanessa@smith.info', Subscription Date=datetime.date(2020, 8, 24), Website='http://www.stephenson.com/'),
 Row(Index=2, Customer Id='1Ef7b82A4CAAD10', First Name='Preston', Last Name='Lozano', Company='Vega-Gentry', City='East Jimmychester', Country='Djibouti', Phone 1='5153435776', Phone 2='686-620-1820x944', Email='vmata@colon.com', Subscription Date=datetime.date(2021, 4, 23), Website='http://www.hobbs.com/'),
 Row(Index=3, Customer Id='6F94879bDAfE5a6', First Name='Roy', Last Name='Berry', Company='Murillo-Perry', City='Isabelborough', Country='Antigua and Barbuda', Phone 1='+1-539-402-0259', Phone 2='(496)978-3969x58947', Email='beckycarr@hogan.com', Subscription Date=datetime.date(2020, 3, 25), Website='http://www.lawrence.com/')]

In [45]:
# Project only the Website and Country columns (in that order) and print them.
df.select("Website", "Country").show()

+--------------------+--------------------+
|             Website|             Country|
+--------------------+--------------------+
|http://www.stephe...|               Chile|
|http://www.hobbs....|            Djibouti|
|http://www.lawren...| Antigua and Barbuda|
|http://www.good-l...|  Dominican Republic|
|https://goodwin-i...|Slovakia (Slovak ...|
|http://www.berger...|Bosnia and Herzeg...|
| https://www.le.com/|    Pitcairn Islands|
|https://hammond-r...|            Bulgaria|
|https://www.bullo...|              Cyprus|
|  https://arias.com/|         Timor-Leste|
|https://simmons-h...|            Guernsey|
|http://www.dougla...|             Vietnam|
|http://www.beck.com/|                Togo|
|https://www.brand...|           Sri Lanka|
|http://stevenson....|           Singapore|
|  http://acosta.org/|                Oman|
|http://www.benson...|      Western Sahara|
|http://pitts-cher...|          Mozambique|
|https://mcconnell...|South Georgia and...|
|https://www.camac...|    French

In [46]:
 # Indexing a DataFrame by column name yields a Column expression object
 # (displayed as Column<'City'>), not the column's values.
 df['City']

Column<'City'>

In [47]:
from pyspark.sql.functions import col, concat, lit

# Append the literal " hello " to every Customer Id and store the result in a
# new string column. lit() wraps the Python string as a Column so that concat()
# can combine it with col('Customer Id').
# NOTE(review): the column name contains two consecutive spaces and does not
# describe its content; a later cell drops it by this exact string, so the
# name is kept as-is here.
df = df.withColumn('Exp after  years', concat(col('Customer Id'), lit(" hello ")))

# col('Customer Id') and df['Customer Id'] both produce a Column usable in this
# expression. col() refers to the column by name only, whereas df[...] refers
# to the column of this particular DataFrame.

In [48]:
# Display df again to confirm that 'Exp after  years' was appended as the
# last column.
df.show()

+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+
|Index|    Customer Id|First Name|Last Name|             Company|             City|             Country|             Phone 1|             Phone 2|               Email|Subscription Date|             Website|    Exp after  years|
+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+
|    1|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     Rasmussen Group|     East Leonard|               Chile|        229.077.5154|    397.884.0519x718|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|DD37Cf93aecA6Dc h...|
|    2|1Ef7b82A4CAAD10|   Preston|   Lozano|         Vega-Gentry|East Jimmychester|     

In [49]:
# Show the data without the 'Exp after  years' column. The result of drop()
# is not assigned back, so `df` itself still contains the column afterwards.
df.drop('Exp after  years').show()

+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|Index|    Customer Id|First Name|Last Name|             Company|             City|             Country|             Phone 1|             Phone 2|               Email|Subscription Date|             Website|
+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|    1|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     Rasmussen Group|     East Leonard|               Chile|        229.077.5154|    397.884.0519x718|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|
|    2|1Ef7b82A4CAAD10|   Preston|   Lozano|         Vega-Gentry|East Jimmychester|            Djibouti|          5153435776|    686-620-1820x944|     vmata@colon.com|     

In [50]:
# Print summary statistics (count, mean, ...) per column. Non-numeric string
# columns report NULL for mean, and the Subscription Date column does not
# appear in the summary output.
df.describe().show()

+-------+------------------+---------------+----------+---------+--------------------+-------------+--------+--------------------+-------------------+--------------------+--------------------+--------------------+
|summary|             Index|    Customer Id|First Name|Last Name|             Company|         City| Country|             Phone 1|            Phone 2|               Email|             Website|    Exp after  years|
+-------+------------------+---------------+----------+---------+--------------------+-------------+--------+--------------------+-------------------+--------------------+--------------------+--------------------+
|  count|               100|            100|       100|      100|                 100|          100|     100|                 100|                100|                 100|                 100|                 100|
|   mean|              50.5|           NULL|      NULL|     NULL|                NULL|         NULL|    NULL|      5.1711689515E9|    6.22990414