#### Pandas On Spark -- Catalog.csv

In [0]:
# import pandas for spark
import pyspark.pandas as ps

In [0]:
# Input path and the reader format used to parse it
file_type = "csv"
file_location = "/FileStore/tables/test/catalog.csv"

# Reader settings for CSV input
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Configure the reader one option at a time, then load the file.
# These options target CSV; the original note says other file types ignore them.
csv_reader = spark.read.format(file_type)
csv_reader = csv_reader.option("inferSchema", infer_schema)
csv_reader = csv_reader.option("header", first_row_is_header)
csv_reader = csv_reader.option("sep", delimiter)
df = csv_reader.load(file_location)

display(df)


In [0]:
# wrap the Spark DataFrame `df` in a pandas-on-Spark DataFrame
psdf = ps.DataFrame(data=df)

In [0]:
# rows & cols
# Read .shape a single time and unpack it: on a pandas-on-Spark frame the
# row count is computed when .shape is evaluated, so asking for shape[0] and
# shape[1] separately would do that work twice.
print("\n*** Rows & Cols ***")
row_count, col_count = psdf.shape
print("Rows",row_count)
print("Cols",col_count)

In [0]:
# columns
print("\n*** Column Names ***")
print(psdf.columns)

In [0]:
# data types
print("\n*** Data Types ***")
print(psdf.dtypes)

In [0]:
# print
print(psdf)

In [0]:
# info
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after the report.
psdf.info()

In [0]:
# head
print(psdf.head())

In [0]:
# print 5 point summary, count, mean & std
print(psdf.describe())

In [0]:
# count
print("*** Count =======================")
print(psdf['Country'].count())

In [0]:
# unique count
print("*** Unique ======================")
print(psdf['Country'].nunique())

In [0]:
# summary
print("*** Describe ====================")
print(psdf['Country'].describe())


In [0]:
# group by count
print("*** GroupBy Count ===============")
print(psdf.groupby(['Country'])['Country'].count())

In [0]:
# group by sum
print("*** GroupBy Sum =================")
print(psdf.groupby(['Country'])['Price'].sum())

In [0]:
# group by min
print("*** GroupBy Min =================")
print(psdf.groupby(['Country'])['Price'].min())

In [0]:
# group by mean
print("*** GroupBy Mean ================")
print(psdf.groupby(['Country'])['Price'].mean())


In [0]:
# group by max
print("*** GroupBy Max =================")
print(psdf.groupby(['Country'])['Price'].max())

In [0]:
# country = USA
# Keep only the rows whose Country equals 'USA' and preview the result once.
psdfn = psdf[psdf['Country'] == 'USA']
print(psdfn.head())

In [0]:

# rows whose Country is USA or UK
is_usa = psdf['Country'] == 'USA'
is_uk = psdf['Country'] == 'UK'
psdfn = psdf[is_usa | is_uk]
print(psdfn.head())


In [0]:
# Price >= 10
psdfn = psdf[ psdf['Price'] >= 10.0 ]
print(psdfn.head())

In [0]:
# count of rows with Price >= 10.0
var = psdf[ psdf['Price'] >= 10.0 ]['Price'].count()
print(var)

In [0]:
# add column with default value
psdf['Junk'] = "Junk" 
print(psdf.head())

In [0]:
# drop cols
print("\n*** Drop Cols ***")
psdf = psdf.drop('Junk', axis=1)
print(psdf.head())


In [0]:
# derive Age as the difference between a fixed reference year and Year
reference_year = 2023
psdf['Age'] = reference_year - psdf['Year']
print(psdf.head())

In [0]:
# add column based on values of other column
# Label starts as an empty string for every row; rows matching none of the
# conditions below keep that empty string.
psdf['Label'] = ""
# selective update: each .loc assignment overwrites Label only where its
# Age condition holds. The three ranges (<= 30, 31-40 inclusive of 40, > 40)
# do not overlap, and each mask is rebuilt from psdf right before it is used.
psdf.loc[psdf['Age'] <= 30, 'Label'] = "Twentys"
psdf.loc[(psdf['Age'] > 30) & (psdf['Age'] <= 40), 'Label'] = "Thirtys"
psdf.loc[psdf['Age'] > 40, 'Label'] =  "Fortys"
# Disabled alternative using np.where; `np` is not imported in the cells
# shown here, so these lines would need a numpy import before being re-enabled.
#psdf['Label'] = np.where(psdf['Age']<=30, '', psdf['Label'])
#psdf['Label'] = np.where(psdf['Age']<=30, 'Twens', psdf['Label'])
#psdf['Label'] = np.where(psdf['Age']>30,'Oldie', psdf['Label'])
# prints the frame itself (not just head()) with the new Label column
print(psdf)


In [0]:
# using for loop
# iterrows() already hands back each row as a Series, so read Title and
# Artist from that row instead of indexing the frame by label on every pass
# (psdf['Title'][i] is a separate lookup against the frame each time).
# Only the two printed columns are iterated, so no unused data is carried along.
for _, row in psdf[['Title', 'Artist']].iterrows():
    print(row['Title'],"||",row['Artist'])


In [0]:
# artist starts with bob
psdfn = psdf [ psdf['Artist'].str.startswith("Bob") ]
print(psdfn)


In [0]:
# artist ends with rose
psdfn = psdf [ psdf['Artist'].str.endswith("Rose") ]
print(psdfn)


In [0]:
# artist contains ram
psdfn = psdf [ psdf['Artist'].str.contains("Ram") ]
print(psdfn)


In [0]:
# case-insensitive search: Artist contains the text in vString
import string
vString = "RAM"
# lower-case both the search text and the column before matching
needle = vString.lower()
artist_lower = psdf['Artist'].str.lower()
psdfn = psdf[artist_lower.str.contains(needle)]
print(psdfn)


In [0]:
# add col
# Every row gets the same float value in the new Number column.
psdf['Number'] = 3.6
print(psdf.head())
# info() prints its own report and returns None; calling it directly avoids
# the extra "None" line that print(psdf.info()) would emit.
psdf.info()

In [0]:
# change col data type
# Replace Number with its integer-cast version.
psdf['Number'] = psdf['Number'].astype(int)
print(psdf.head())
# info() prints its own report and returns None; calling it directly avoids
# the extra "None" line that print(psdf.info()) would emit.
psdf.info()

In [0]:
# ren col
# rename() returns a new frame, so the result is bound back to psdf.
psdf=psdf.rename(columns = {'Number':'DelCol'})
# info() prints its own report and returns None; calling it directly avoids
# the extra "None" line that print(psdf.info()) would emit.
psdf.info()

In [0]:
# del col
# drop() returns a new frame without DelCol; rebind psdf to it.
psdf = psdf.drop('DelCol', axis=1)
# info() prints its own report and returns None; calling it directly avoids
# the extra "None" line that print(psdf.info()) would emit.
psdf.info()