In [1]:
import numpy as np
import pandas as pd

# Load the eight training shards (train-1.csv .. train-8.csv) from the working
# directory, keeping one DataFrame per file in shard order.
dfs = [pd.read_csv(f"train-{i}.csv") for i in range(1, 9)]

In [2]:
# Stack the per-file frames into a single table with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

# Cells holding the literal two-character token \N are treated as missing.
# Reassign instead of mutating in place so re-running the cell stays predictable.
combined_df = combined_df.replace("\\N", np.nan)

# to numerical: values that cannot be parsed as numbers become NaN (errors='coerce')
numerical_columns = ['startYear', 'endYear', 'runtimeMinutes', 'numVotes']
combined_df[numerical_columns] = combined_df[numerical_columns].apply(pd.to_numeric, errors='coerce')

# to bool: read_csv may already have parsed this column into real booleans, in
# which case mapping with string keys would match nothing and turn every label
# into NaN. Normalising to text first makes the mapping work for both bool and
# string input; anything other than 'True'/'False' still ends up as NaN.
combined_df['label'] = combined_df['label'].astype(str).map({'True': True, 'False': False})

print(combined_df)

      Unnamed: 0     tconst                                   primaryTitle  \
0              4  tt0010600                                       The Doll   
1              7  tt0011841                                  Way Down East   
2              9  tt0012494                                        Déstiny   
3             25  tt0015163                                  The Navigator   
4             38  tt0016220                       The Phantom of the Opera   
...          ...        ...                                            ...   
7954        9966  tt9625664                                  Trauma Center   
7955        9981  tt9741310                                          Slaxx   
7956        9982  tt9742392                                        Kindred   
7957        9996  tt9850386  The Bee Gees: How Can You Mend a Broken Heart   
7958        9999  tt9911196                            The Marriage Escape   

                       originalTitle  startYear  endYear  runti

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType, BooleanType

# Start (or reuse) a Spark session for this cleaning step.
spark = SparkSession.builder \
    .appName("data cleaning") \
    .getOrCreate()

files = ["train-1.csv", "train-2.csv", "train-3.csv", "train-4.csv",
              "train-5.csv", "train-6.csv", "train-7.csv", "train-8.csv"]

# No schema inference is requested, so every column is read as a string.
df = spark.read.csv(files, header=True)
# Discard the first column by position; the remaining columns are cleaned below.
df = df.drop(df.columns[0])

numerical_columns = ['startYear', 'endYear', 'runtimeMinutes', 'numVotes']

# Target type for each column that must not stay a string.
cast_types = {name: IntegerType() for name in numerical_columns}
cast_types["label"] = BooleanType()

# Fail loudly if an expected column is absent instead of silently skipping its cast.
missing_columns = [name for name in cast_types if name not in df.columns]
if missing_columns:
    raise ValueError(f"expected columns not found in input: {missing_columns}")

# Build every cleaned column first and apply them in ONE select. Calling
# withColumn once per column in a loop stacks a projection per call and can
# produce very large query plans.
cleaned_columns = []
for col_name in df.columns:
    # The literal two-character token \N stands for a missing value.
    cleaned = when(col(col_name) == "\\N", None).otherwise(col(col_name))
    if col_name in cast_types:
        cleaned = cleaned.cast(cast_types[col_name])
    # alias keeps the original column name and position.
    cleaned_columns.append(cleaned.alias(col_name))

df = df.select(cleaned_columns)
df.show()

# Note: this ends the session, so `df` cannot be evaluated again afterwards.
spark.stop()

+---------+--------------------+----------------+---------+-------+--------------+--------+-----+
|   tconst|        primaryTitle|   originalTitle|startYear|endYear|runtimeMinutes|numVotes|label|
+---------+--------------------+----------------+---------+-------+--------------+--------+-----+
|tt0014109|The Saga of Gösta...|            NULL|     1924|   NULL|           183|    1231| true|
|tt0015064|      The Last Laugh| Der letzte Mann|     1924|   NULL|            77|    NULL| true|
|tt0015841|        The Freshman|    The Freshman|     1925|   NULL|            77|    5374| true|
|tt0017271|          By the Law|            NULL|     NULL|   1926|            80|    1057| true|
|tt0018451|The Student Princ...|            NULL|     1927|   NULL|           106|    1459| true|
|tt0018742|       The Cameraman|   The Cameraman|     1928|   NULL|            76|   11388| true|
|tt0019379|         Show People|            NULL|     1928|   NULL|            83|    3695| true|
|tt0020018|      In 