In [0]:
import pandas as pd
# Create a sample DataFrame
data = {
"Size": ["Medium", "Large", None, "Small", "Medium", "Large", "Small", None,
         "Medium", "Large"],
"Color": ["RED", "", "BLUE", "Green", "Red", "Blue", "", "Green", "Purple",
"Red"],
"Pattern": ["Striped", "Polka dot", None, "Argyle", "Checkered", "Graphic",
"Solid", None, "Polka dot", "Striped"],
"Material": ["Cotton", "123", "Wool", "Bamboo", "Cotton", "Polyester",
"Nylon", "123", "Spandex", "Wool"]
}
df = pd.DataFrame(data)
display(df)

Size,Color,Pattern,Material
Medium,RED,Striped,Cotton
Large,,Polka dot,123
,BLUE,,Wool
Small,Green,Argyle,Bamboo
Medium,Red,Checkered,Cotton
Large,Blue,Graphic,Polyester
Small,,Solid,Nylon
,Green,,123
Medium,Purple,Polka dot,Spandex
Large,Red,Striped,Wool


In [0]:
df['Size'] = df['Size'].fillna(df['Size'].mode()[0])
df['Pattern'] = df['Pattern'].fillna(df['Pattern'].mode()[0])
display(df)

Size,Color,Pattern,Material
Medium,RED,Striped,Cotton
Large,,Polka dot,123
Large,BLUE,Polka dot,Wool
Small,Green,Argyle,Bamboo
Medium,Red,Checkered,Cotton
Large,Blue,Graphic,Polyester
Small,,Solid,Nylon
Large,Green,Polka dot,123
Medium,Purple,Polka dot,Spandex
Large,Red,Striped,Wool


In [0]:
df = df[df['Color'] != ""]
display(df)

Size,Color,Pattern,Material
Medium,RED,Striped,Cotton
Large,BLUE,Polka dot,Wool
Small,Green,Argyle,Bamboo
Medium,Red,Checkered,Cotton
Large,Blue,Graphic,Polyester
Large,Green,Polka dot,123
Medium,Purple,Polka dot,Spandex
Large,Red,Striped,Wool


In [0]:
df['Color'] = df['Color'].str.capitalize()
display(df)

Size,Color,Pattern,Material
Medium,Red,Striped,Cotton
Large,Blue,Polka dot,Wool
Small,Green,Argyle,Bamboo
Medium,Red,Checkered,Cotton
Large,Blue,Graphic,Polyester
Large,Green,Polka dot,123
Medium,Purple,Polka dot,Spandex
Large,Red,Striped,Wool


In [0]:
df['Material'] = df['Material'].apply(lambda x: "Unknown" if not x.isalpha() else
x)
display(df)

Size,Color,Pattern,Material
Medium,Red,Striped,Cotton
Large,Blue,Polka dot,Wool
Small,Green,Argyle,Bamboo
Medium,Red,Checkered,Cotton
Large,Blue,Graphic,Polyester
Large,Green,Polka dot,Unknown
Medium,Purple,Polka dot,Spandex
Large,Red,Striped,Wool


In [0]:
# Check current data types
print(df.dtypes)
# Assume the user has decided the current types are correct; otherwise, convert as necessary
# Example: Convert 'Material' to categorical if it was not
df['Material'] = df['Material'].astype('category')
display(df)

Size        object
Color       object
Pattern     object
Material    object
dtype: object


Size,Color,Pattern,Material
Medium,Red,Striped,Cotton
Large,Blue,Polka dot,Wool
Small,Green,Argyle,Bamboo
Medium,Red,Checkered,Cotton
Large,Blue,Graphic,Polyester
Large,Green,Polka dot,Unknown
Medium,Purple,Polka dot,Spandex
Large,Red,Striped,Wool


In [0]:
df = df.drop_duplicates()

In [0]:
print(df['Material'].unique())
# Assuming 'Unknown' is considered an outlier
df = df[df['Material'] != 'Unknown']

['Cotton', 'Wool', 'Bamboo', 'Polyester', 'Unknown', 'Spandex']
Categories (6, object): ['Bamboo', 'Cotton', 'Polyester', 'Spandex', 'Unknown', 'Wool']


In [0]:
spark_df = spark.createDataFrame(df)
spark_df.write.parquet('cleaned_sock_data11.parquet', mode="overwrite")

In [0]:
parquet_df = spark.read.parquet("/cleaned_sock_data11.parquet")
display(parquet_df)

Size,Color,Pattern,Material
Medium,Red,Striped,Cotton
Medium,Red,Checkered,Cotton
Large,Blue,Graphic,Polyester
Medium,Purple,Polka dot,Spandex
Large,Red,Striped,Wool
Large,Blue,Polka dot,Wool
Small,Green,Argyle,Bamboo


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, initcap, when, lower, lit, count
# Example DataFrame creation
data = [("Medium", "Red", "Striped", "Cotton"),
("Large", "", "Polka dot", "123"),
(None, "Blue", None, "Wool"),
("Small", "Green", "Argyle", "Bamboo"),
("Medium", "Red", "Checkered", "Cotton"),
("Large", "Blue", "Graphic", "Polyester"),
("Small", "", "Solid", "Nylon"),
(None, "Green", None, "123"),
("Medium", "Purple", "Polka dot", "Spandex"),
("Large", "Red", "Striped", "Wool")]
schema = ["Size", "Color", "Pattern", "Material"]
df = spark.createDataFrame(data, schema=schema)
# Display Data
df.show()
# Fixing Null Values
mode_size = df.groupBy("Size").count().orderBy("count", ascending=False).first()["Size"]
mode_pattern = df.groupBy("Pattern").count().orderBy("count", ascending=False).first()["Pattern"]
df = df.withColumn("Size", when(col("Size").isNull(), mode_size).otherwise(col("Size")))
df = df.withColumn("Pattern", when(col("Pattern").isNull(), mode_pattern).otherwise(col("Pattern")))
# Standardizing Text Fields
df = df.withColumn("Color", initcap(col("Color")))
# Removing Blank Entries
df = df.filter(col("Color") != "")
# Correcting Invalid Material Entries
df = df.withColumn("Material", when(col("Material").rlike("^[0-9]+$"),
lit("Unknown")).otherwise(col("Material")))
# Schema Validation
df.printSchema() # Assuming schema is as intended, otherwise, cast types as necessary
# Deduplication
df = df.dropDuplicates()
#Handling Outliers
# Remove 'Unknown' considered as outlier for illustration
df = df.filter(col("Material") != "Unknown")
# Export Cleaned Data
# Save to Parquet
df.write.parquet("cleaned_data_spark<number>.parquet", mode="overwrite")
# Data Integrity Checks
# Show the cleaned DataFrame and verify no nulls or empty strings
df.show()
df.select([count(when(col(c).isNull() | (col(c) == ""), c)).alias(c) for c in
df.columns]).show()
display(df)

+------+------+---------+---------+
|  Size| Color|  Pattern| Material|
+------+------+---------+---------+
|Medium|   Red|  Striped|   Cotton|
| Large|      |Polka dot|      123|
|  NULL|  Blue|     NULL|     Wool|
| Small| Green|   Argyle|   Bamboo|
|Medium|   Red|Checkered|   Cotton|
| Large|  Blue|  Graphic|Polyester|
| Small|      |    Solid|    Nylon|
|  NULL| Green|     NULL|      123|
|Medium|Purple|Polka dot|  Spandex|
| Large|   Red|  Striped|     Wool|
+------+------+---------+---------+

root
 |-- Size: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Pattern: string (nullable = true)
 |-- Material: string (nullable = true)

+------+------+---------+---------+
|  Size| Color|  Pattern| Material|
+------+------+---------+---------+
|Medium|   Red|  Striped|   Cotton|
|Medium|  Blue|Polka dot|     Wool|
| Small| Green|   Argyle|   Bamboo|
| Large|  Blue|  Graphic|Polyester|
|Medium|   Red|Checkered|   Cotton|
|Medium|Purple|Polka dot|  Spandex|
| Large|   Re

Size,Color,Pattern,Material
Medium,Red,Striped,Cotton
Medium,Blue,Polka dot,Wool
Small,Green,Argyle,Bamboo
Large,Blue,Graphic,Polyester
Medium,Red,Checkered,Cotton
Medium,Purple,Polka dot,Spandex
Large,Red,Striped,Wool
