In [1]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
from pyspark import SparkContext, SparkConf     

In [2]:
conf = SparkConf()
sc = SparkContext(conf=conf)

In [3]:
spark = SparkSession \
    .builder \
    .appName("CSV with conditions") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [4]:
char = spark.read.text("dataset/character.metadata.tsv")
meta=spark.read.text("dataset/movie.metadata.tsv")
name=spark.read.text("dataset/name.clusters.txt")
plot = spark.read.text("dataset/plot_summaries.txt")
tv = spark.read.text("dataset/tvtropes.clusters.txt")

In [None]:
sc.setLogLevel("WARN")
#setup the same way you have it
log_txt=sc.textFile("dataset/character.metadata.tsv")
header = log_txt.first()

#filter out the header, make sure the rest looks correct
log_txt = log_txt.filter(lambda line: line != header)

temp_var = log_txt.map(lambda k: k.split("\\t"))

#here's where the changes take place
#this creates a dataframe using whatever pyspark feels like using (I think string is the default). the header.split is providing the names of the columns
log_df=temp_var.toDF(header.split("\\t"))
log_df.show()

In [5]:
char = spark.read.text("dataset/character.metadata.tsv")
char.printSchema()
char.show()
char.selectExpr("split(value, '\t') as columns").show(3, False)

root
 |-- value: string (nullable = true)

+--------------------+
|               value|
+--------------------+
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|3196793\t/m/08yl5...|
|3196793\t/m/08yl5...|
|3196793\t/m/08yl5...|
+--------------------+
only showing top 20 rows

+------------------------------------------------------------------------------------------------------------------------------------------------------+
|columns                                                                                                                                               |
+------------------------------------------------------------------------

In [11]:
char = spark.read.option("header", "true")\
    .option("delimiter", '\t')\
    .option("inferSchema", "false") \
    .text("dataset/character.metadata.tsv")
char.show()

+--------------------+
|               value|
+--------------------+
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|975900\t/m/03vyhn...|
|3196793\t/m/08yl5...|
|3196793\t/m/08yl5...|
|3196793\t/m/08yl5...|
+--------------------+
only showing top 20 rows



In [8]:
split_col = pyspark.sql.functions.split(df2['value'], '\t')
df2 = df2.withColumn('NAME1', split_col.getItem(0))
df2 = df2.withColumn('NAME2', split_col.getItem(1))

df2.show(10,truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------+------+---------+
|value                                                                                                                                               |NAME1 |NAME2    |
+----------------------------------------------------------------------------------------------------------------------------------------------------+------+---------+
|975900\t/m/03vyhn\t2001-08-24\tAkooshay\t1958-08-26\tF\t1.62\t\tWanda De Jesus\t42\t/m/0bgchxw\t/m/0bgcj3x\t/m/03wcfv7                              |975900|/m/03vyhn|
|975900\t/m/03vyhn\t2001-08-24\tLieutenant Melanie Ballard\t1974-08-15\tF\t1.78\t/m/044038p\tNatasha Henstridge\t27\t/m/0jys3m\t/m/0bgchn4\t/m/0346l4|975900|/m/03vyhn|
|975900\t/m/03vyhn\t2001-08-24\tDesolation Williams\t1969-06-15\tM\t1.727\t/m/0x67\tIce Cube\t32\t/m/0jys3g\t/m/0bgchn_\t/m/01vw26l                  |975900|/m/