The following code in this notebook was added to the 'mount_s3_and_get_data' notebook on the Databricks platform and used to clean the dataframes created by running that notebook.

## Clean df_pin dataframe

In [None]:
def add_nulls_to_dataframe_column(dataframe, column, value_to_replace):
    '''Return dataframe with placeholder entries in one column replaced by null.

    dataframe: the Spark dataframe to clean.
    column: name of the column whose entries are checked.
    value_to_replace: SQL LIKE pattern identifying the entries to null out
        (a pattern without wildcards matches only that exact text).

    Entries that do not match the pattern keep their current value.
    '''
    current_value = col(column)
    is_placeholder = current_value.like(value_to_replace)
    cleaned_value = when(is_placeholder, None).otherwise(current_value)
    return dataframe.withColumn(column, cleaned_value)

In [None]:
# replace empty entries and entries with no relevant data in each column with Nones
# column names mapped to the SQL LIKE pattern that marks a placeholder value in that column
columns_and_values_for_null = {
    "description": "No description available%",
    "follower_count": "User Info Error",
    "image_src": "Image src error.",
    "poster_name": "User Info Error",
    "tag_list": "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",
    "title": "No Title Data Available"
}
# loop through dictionary, calling function with dictionary values as arguments
for key, value in columns_and_values_for_null.items():
    df_pin = add_nulls_to_dataframe_column(df_pin, key, value)

# Convert follower_count strings such as "136k" or "4M" into whole numbers.
# The suffix is removed and the remaining number is multiplied, instead of textually
# swapping the suffix for zeros: a textual swap turns a fractional value like "1.5k"
# into "1.5000", which does not cast to the intended 1500.
follower_suffix_multipliers = {"k": 1000, "M": 1000000}
follower_raw = col("follower_count")
# decimal (not double) keeps the multiplication exact before truncating to an integer
follower_number = follower_raw.cast("decimal(20,3)")
for suffix, multiplier in follower_suffix_multipliers.items():
    scaled = regexp_replace(follower_raw, suffix, "").cast("decimal(20,3)") * multiplier
    follower_number = when(follower_raw.contains(suffix), scaled).otherwise(follower_number)
# cast follower_count column to integer type (nulls and unparseable text stay null)
df_pin = df_pin.withColumn("follower_count", follower_number.cast("int"))

# convert save_location column to include only the save location path
df_pin = df_pin.withColumn("save_location", regexp_replace("save_location", "Local save in ", ""))
# rename the index column to ind
df_pin = df_pin.withColumnRenamed("index", "ind")
# reorder columns
new_pin_column_order = [
    "ind",
    "unique_id",
    "title",
    "description",
    "follower_count",
    "poster_name",
    "tag_list",
    "is_image_or_video",
    "image_src",
    "save_location",
    "category"
]
df_pin = df_pin.select(new_pin_column_order)

In [None]:
# display changes: show at most 50 rows of the cleaned df_pin in the notebook output
df_pin.limit(50).display()
# print the column names and types, e.g. to check follower_count is now an integer column
df_pin.printSchema()

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
6717,bc5ab9ee-505e-44f6-92ba-677fe4fdf3e3,〚 Уютные шведские коттеджи от Carina Olander 〛 ◾ Фото ◾ Идеи ◾ Дизайн,"Традиционные шведские коттеджи, обычно с красным фасадом — это настоящее воплощением идеального зимнего уюта. Они обычно оформлены очень просто и ✌PUFIK. Beautiful Interiors. On…",136000,PUFIK Interiors & Inspirations,"Scandinavian Cottage,Swedish Cottage,Swedish Home Decor,Swedish Farmhouse,Swedish Style,Swedish Kitchen,Kitchen Black,Swedish House,Cozy Cottage",image,https://i.pinimg.com/originals/32/eb/72/32eb72e4fd8654c115a64528bd1f34b4.png,/data/home-decor,home-decor
6633,d136f6bc-840d-44f8-bbad-115eb7e6c51e,The Cottage Journal on Instagram: “Can you say color?! 😍😍😍 We are loving the cheery vibes that these aqua blue cabinets are giving. If you could paint your cabinets any…”,"6,636 Likes, 141 Comments - The Cottage Journal (@thecottagejournal) on Instagram: “Can you say color?! 😍😍😍 We are loving the cheery vibes that these aqua blue cabinets are g…",394,Sarah Martin,"Diy Kitchen Cabinets,Kitchen Redo,Home Decor Kitchen,New Kitchen,Home Kitchens,Kitchen Remodeling,Aqua Kitchen,Kitchen Counters,Kitchen Islands",image,https://i.pinimg.com/originals/8c/17/a2/8c17a257b70780480bb89c3699363144.jpg,/data/home-decor,home-decor
1706,b5c8a1b5-9e90-4522-9bec-2477b698d5b7,Standing Figurine Toys Xmas Santa Claus Snowman Reindeer Figure Plush Dolls Christmas Decorations Ornaments Home Indoor Table Ornaments Christmas Party Tree Hanging Decor Toys Gifts for Kids Friends…,"Features: Material:Lint Size:48ｘ18cm Quantity:1 pc Shape:Santa Claus, snowman. Elk Occasion:Christmas Description: 1. Fashion design, high quality 2. Santa Claus, snowman. Elk C…",5000,Wear24-7,"Merry Christmas To You,Christmas Toys,Great Christmas Gifts,Christmas Snowman,Christmas Ornaments,Holiday,Christmas Party Decorations,Christmas Themes,Decoration Party",image,https://i.pinimg.com/originals/b5/7f/21/b57f219fa89c1165b57525b8eae711da.jpg,/data/christmas,christmas
1676,3ed92c2d-9cca-4ccf-ac25-44a9d8bec919,PAUPOO™ Pre-lit Artificial Christmas Hanging Basket - Flocked with Mixed Decorations and White LED Lights - Frosted Berry BUY 5 GET 2 FREE(7PACKS),"❤️ ❤️ MERRY CHRISTMAS ❤️ ❤️ ❤️ ❤️ Early Christmas Special:Buy 3 Get 1 Free, Buy 5 Get 2 Free,Deadline November 25. Color:GreenMaterial:Polyvinyl ChlorideItem Dimensions:LxWxH 20…",784,paupoo,"Christmas Hanging Baskets,Christmas Plants,Christmas Wreaths,Christmas Ornaments,Merry Christmas,Christmas Sale,Christmas Porch Ideas,Hanging Christmas Lights,Christmas Island",image,https://i.pinimg.com/originals/ef/40/7e/ef407e9568aa46fed4162bd1fd28786e.jpg,/data/christmas,christmas
2482,08604f20-fa17-4b9a-9949-781717eca6cd,FORNT PORCH CHRISTMAS DECORATING IDEAS,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,…",46000,"Life on Summerhill | Home, Holiday Decor & DIY Website","Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",video,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,/data/christmas,christmas
2430,7e16143b-d84a-40a3-a617-e736b728df5d,"9ft Lighted Winter Garland with Bows and Pine Accents, Indoor or Outdoor Christmas Decor","About This Item We aim to show you accurate product information. Manufacturers, suppliers and others provide what you see here, and we have not verified it. 9ft Lighted Winter G…",5000,Wear24-7,"Outside Christmas Decorations,Christmas House Lights,Holiday Decor,Decorating For Christmas Outdoors,Christmas Outdoor Lights,Fireplace Mantel Christmas Decorations,Exterior Christmas Lights,Christmas Garlands,Beautiful Christmas Decorations",image,https://i.pinimg.com/originals/ac/28/79/ac28794ec86c522658775b03e93e8cc9.jpg,/data/christmas,christmas
4348,c25f8906-d9ab-414c-90d6-a5322f714e44,Surprise Birthday Cards 40th 50th 60th 70th 75th 80th 85th 90th Birthday Cards Fifty Fabulous Birthday Party Forty Birthday Invitation,"""adult Surprise Birthday Cards 40th 50th 60th 70th 75th 80th 85th 90th Birthday Cards Fifty Fabulous Birthday Party Forty Birthday Invitation 50th 60th 70th 80th Surprise Birthd…",4000000,Etsy,"90th Birthday Cards,Forty Birthday,Birthday Postcards,70th Birthday Parties,Happy Birthday Messages,Surprise Birthday,Birthday Ideas,Surprise Party Invitations,Passport Wedding Invitations",image,https://i.pinimg.com/originals/9f/a7/b1/9fa7b1fe33555b81d050e6ebf6bb8871.jpg,/data/event-planning,event-planning
6521,dd508c7e-4ff3-4a94-94cb-abc1327c1f58,Allan Copley Designs Calligraphy Square Glass Top End Table in Espresso Finish with Brushed Stainless Steel Accents by Allan Copley Designs - Espresso,Description The Calligraphy Collection by Allan Copley Designs is meticulously crafted with great attention to detail. The Espresso on Kulin finish with Brushed Stainless Steel…,137,"Your Home, Reimagined LLC","Glass Top End Tables,End Table Sets,Sofa End Tables,End Tables With Storage,Side Tables,Metal Furniture,Rustic Furniture,Living Room Furniture,Luxury Furniture",image,https://i.pinimg.com/originals/5f/4f/9f/5f4f9ff6154aba517a852ceb967e2c11.jpg,/data/home-decor,home-decor
6521,dd508c7e-4ff3-4a94-94cb-abc1327c1f58,Allan Copley Designs Calligraphy Square Glass Top End Table in Espresso Finish with Brushed Stainless Steel Accents by Allan Copley Designs - Espresso,Description The Calligraphy Collection by Allan Copley Designs is meticulously crafted with great attention to detail. The Espresso on Kulin finish with Brushed Stainless Steel…,137,"Your Home, Reimagined LLC","Glass Top End Tables,End Table Sets,Sofa End Tables,End Tables With Storage,Side Tables,Metal Furniture,Rustic Furniture,Living Room Furniture,Luxury Furniture",image,https://i.pinimg.com/originals/5f/4f/9f/5f4f9ff6154aba517a852ceb967e2c11.jpg,/data/home-decor,home-decor
10538,5d9fa7e2-2118-4442-99b6-537d60463a6a,BC Customs (BCC) Search and Rescue Tactical Vehicle-5 (SRTV-5) Baja Racing-Type All-Terrain Combat Vehicle Armed/Weaponized with 7.62mm NATO Garwood Industries (GI) M134G Minigun/Gatling Gun: SXOR…,"By David Crane ; defrev (at) gmail (dot) com All photos contained in this article were shot by DefenseReview.com (DR), and are copyrighted. DefenseReview.com owns the copyright…",709,Ricky Lee,"Army Vehicles,Armored Vehicles,Cool Trucks,Cool Cars,Amphibious Vehicle,Offroader,Bug Out Vehicle,Vehicle Wraps,Terrain Vehicle",image,https://i.pinimg.com/originals/36/63/12/366312d747da1358397610a86bf21b20.jpg,/data/vehicles,vehicles


## Clean df_geo dataframe

In [None]:
# import types
from pyspark.sql.types import ArrayType, DoubleType
# built-in array constructor, used instead of a Python UDF so the work stays inside Spark
from pyspark.sql.functions import array

# final column order for the cleaned dataframe
new_geo_column_order = [
    "ind",
    "country",
    "coordinates",
    "timestamp",
]
df_geo = (
    df_geo
    # combine latitude and longitude into one [latitude, longitude] array column;
    # the explicit casts guarantee an array of doubles whatever type the inputs have
    .withColumn(
        "coordinates",
        array(
            col("latitude").cast(DoubleType()),
            col("longitude").cast(DoubleType()),
        ),
    )
    # drop the latitude and longitude columns now that coordinates holds both
    .drop("latitude", "longitude")
    # convert timestamp column from type string to type timestamp
    .withColumn("timestamp", to_timestamp("timestamp"))
    # change column order
    .select(new_geo_column_order)
)

In [None]:
# display changes: show at most 50 rows of the cleaned df_geo in the notebook output
df_geo.limit(50).display()
# print the column names and types, e.g. to check the coordinates and timestamp columns
df_geo.printSchema()

ind,country,coordinates,timestamp
9455,British Indian Ocean Territory (Chagos Archipelago),"List(-82.9272, -150.346)",2022-03-15T01:46:32.000+0000
6814,British Indian Ocean Territory (Chagos Archipelago),"List(-86.5675, -149.565)",2022-09-02T11:34:28.000+0000
5111,British Indian Ocean Territory (Chagos Archipelago),"List(-83.7472, 8.65953)",2021-04-01T00:56:57.000+0000
10073,Antarctica (the territory South of 60 deg S),"List(-32.8885, -170.295)",2021-06-29T19:56:04.000+0000
2418,Antarctica (the territory South of 60 deg S),"List(-88.4642, -171.061)",2022-05-27T11:30:59.000+0000
5162,Antarctica (the territory South of 60 deg S),"List(-71.6607, -149.206)",2019-09-27T19:06:43.000+0000
1335,Antarctica (the territory South of 60 deg S),"List(-77.9931, -175.682)",2022-03-19T17:29:42.000+0000
9185,Antarctica (the territory South of 60 deg S),"List(-10.3764, -22.9809)",2019-10-06T18:12:55.000+0000
9335,Antarctica (the territory South of 60 deg S),"List(-88.4642, -171.061)",2020-11-14T23:42:22.000+0000
6749,Antarctica (the territory South of 60 deg S),"List(-88.4642, -171.061)",2018-04-16T07:39:46.000+0000


## Clean df_user dataframe

In [None]:
# final column order for the cleaned dataframe
new_user_column_order = [
    "ind",
    "user_name",
    "age",
    "date_joined",
]
df_user = (
    df_user
    # build the full name from first and last name, separated by a single space
    .withColumn("user_name", concat_ws(" ", "first_name", "last_name"))
    # the separate name columns are no longer needed
    .drop("first_name", "last_name")
    # parse date_joined from string into a timestamp
    .withColumn("date_joined", to_timestamp("date_joined"))
    # apply the column order defined above
    .select(new_user_column_order)
)

In [None]:
# display changes: show at most 50 rows of the cleaned df_user in the notebook output
df_user.limit(50).display()
# print the column names and types, e.g. to check date_joined is now a timestamp column
df_user.printSchema()

ind,user_name,age,date_joined
2015,Christopher Bradshaw,27,2016-03-08T13:38:37.000+0000
10673,Alexander Cervantes,59,2017-05-12T21:22:17.000+0000
1857,Christopher Hamilton,48,2016-02-27T16:57:44.000+0000
10020,Christopher Hawkins,45,2016-09-15T06:02:53.000+0000
2041,Christopher Campbell,35,2015-10-22T22:42:23.000+0000
7031,Christopher Anderson,48,2016-06-13T17:09:14.000+0000
6398,Christina Davenport,39,2016-06-29T20:43:59.000+0000
3599,Alexandria Alvarado,20,2015-10-23T04:13:23.000+0000
4256,Alexandria Alvarado,20,2015-10-23T04:13:23.000+0000
1901,Michelle Richardson,44,2016-12-18T16:05:39.000+0000
