In [None]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (or reuse an already running one),
# registered under the application name "Titanic".
spark = SparkSession.builder.appName("Titanic").getOrCreate()

In [None]:
from itertools import chain
from pyspark.sql import functions as F
from IPython.core.interactiveshell import InteractiveShell
from functools import reduce
import pandas as pd

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the training CSV (first line is the header, column types inferred by Spark)
# and cache it because the exploratory cells scan it repeatedly.
df1 = (
    spark.read.options(header=True, inferSchema=True)
    .csv("/FileStore/tables/titanic_train.csv")
    .cache()
)

In [None]:
# Print Spark's default summary statistics for the label and three numeric columns.
df1.select("Survived", "Pclass", "Age", "Fare").summary().show()

In [None]:
# Dataset dimensions: (number of rows, number of columns).
(df1.count(), len(df1.columns))

# Categorical features whose value counts are broken down by "Survived".
count_col = ["Sex", "Pclass", "SibSp", "Parch", "Embarked"]

# One pivot table per feature. Pivot columns are named after the raw values
# ("0", "1", "2", ...), which collide between Pclass, SibSp and Parch once the
# tables are joined, so every value column is prefixed with its feature name.
pivot_counts = []
for feature in count_col:
    pivoted = df1.groupBy("Survived").pivot(feature).count()
    prefixed = [feature + "_" + value for value in pivoted.columns[1:]]
    pivot_counts.append(pivoted.toDF("Survived", *prefixed))

# Join the overall count, the mean Fare/Age and all pivot tables on "Survived"
# into a single small pandas frame.
df_count = reduce(
    lambda left, right: left.join(right, on="Survived", how="left"),
    [
        df1.groupBy("Survived").count(),
        df1.groupBy("Survived").mean("Fare", "Age"),
    ]
    + pivot_counts,
).toPandas()
# Show the profile; an assignment alone renders nothing.
df_count

In [None]:
def count_null(df):
    """Count the null values in every column of a Spark DataFrame.

    All columns are counted in a single aggregation pass instead of running
    one filter/count job per column.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        pandas.DataFrame with one row per column of ``df`` and the columns
        ``feature`` (column name) and ``null_count`` (number of null rows).
    """
    columns = df.columns
    if not columns:
        return pd.DataFrame([], columns=["feature", "null_count"])
    # when(...) without otherwise() yields null for non-matching rows and
    # count() skips nulls, so each expression counts exactly the null rows.
    null_exprs = [F.count(F.when(df[name].isNull(), 1)) for name in columns]
    # A global aggregate always returns exactly one row; read it by position.
    counts = df.select(null_exprs).first()
    return pd.DataFrame(list(zip(columns, counts)), columns=["feature", "null_count"])


# Null counts per column of the freshly loaded training data.
count_null(df1)

In [None]:
# Mean, median ("50%") and max of the three columns inspected before imputation.
# NOTE(review): presumably used to pick the fill values in the next cell — confirm.
df1.select("Fare", "Embarked", "Age").summary("mean", "50%", "max").show()

In [None]:
# Replace missing Fare with 14.45 and missing Embarked with "S".
# NOTE(review): 14.45 looks like the Fare median from the summary above and "S"
# like the most frequent port — confirm. The same literals are repeated in the
# later train/test preparation cells, so keep all three in sync.
df1 = df1.fillna({"Fare": 14.45, "Embarked": "S"})

In [None]:
# Extract the title from Name: the first run of letters that is directly
# followed by a period. A raw string keeps "\." a regex escape and avoids
# Python's invalid-escape-sequence warning.
df1 = df1.withColumn("Title", F.regexp_extract(df1["Name"], r"([A-Za-z]+)\.", 1))
# Per title: number of non-null ages and their mean, sorted by that count.
df1.groupBy("Title").agg(F.count("Age"), F.mean("Age")).sort("count(Age)").show()

In [None]:
# Collapse the extracted titles into four groups: Mr, Miss, Mrs and Master.
# 'Mme' (Madame) is grouped with 'Mrs', mirroring 'Mlle' (Mademoiselle) -> 'Miss'.
title_dic = {'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
             'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr',
             'Don': 'Mr', 'Mme': 'Mrs', 'Jonkheer': 'Mr', 'Lady': 'Mrs',
             'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs',
             'Dr': 'Mr', 'Rev': 'Mr'}
# create_map expects a flat key1, value1, key2, value2, ... sequence of columns.
mapping = F.create_map([F.lit(i) for i in chain(*title_dic.items())])
# Titles that are not keys of title_dic look up to null in the map.
df1 = df1.withColumn('Title', mapping[df1['Title']])

In [None]:
def age_fillna(df, title, age):
    """Fill missing ages for one title group.

    Args:
        df: Spark DataFrame with "Age" and "Title" columns.
        title: Title value selecting the rows to fill.
        age: Value written into "Age" where it is null and "Title" equals ``title``.

    Returns:
        A new DataFrame whose "Age" column has the selected nulls replaced;
        every other row keeps its existing "Age".
    """
    needs_fill = (df["Age"].isNull()) & (df["Title"] == title)
    filled_age = F.when(needs_fill, age).otherwise(df["Age"])
    return df.withColumn("Age", filled_age)

In [None]:
# Mean age per title group, collected to the driver as {title: mean age}.
mean_age_by_title = df1.groupBy("Title").mean("Age").toPandas()
age_fillna_dict = mean_age_by_title.set_index("Title").to_dict()["avg(Age)"]

# Impute the missing ages one title group at a time with that group's mean.
for title, mean_age in age_fillna_dict.items():
    df1 = age_fillna(df1, title, mean_age)

In [None]:
# Combine SibSp and Parch into a single feature and drop the two source columns.
# NOTE: the column name is spelled "FamliySize" (sic). The later train/test
# preparation cells use the same spelling, so rename it everywhere or nowhere.
df1 = df1.withColumn("FamliySize", df1["SibSp"] + df1["Parch"]).drop("Parch", "SibSp")

In [None]:
# Drop the columns that are not used as model features, including the helper
# "Title" column that was only needed for the age imputation.
# NOTE(review): "PassengerID" differs in case from the "PassengerId" selected
# later in the notebook; this presumably relies on Spark's case-insensitive
# column resolution — confirm.
df1 = df1.drop("PassengerID", "Cabin", "Name", "Ticket", "Title")
df1.show(5)

In [None]:
# Re-check the null counts after imputation and column removal.
count_null(df1)

Model

In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import (
    LogisticRegression,
    RandomForestClassifier,
    GBTClassifier,
)
from pyspark.ml.evaluation import (
    MulticlassClassificationEvaluator,
    BinaryClassificationEvaluator,
)
from pyspark.ml import Pipeline
from pyspark.ml import tuning as tune

In [None]:
# Preview the cleaned frame before the categorical columns are indexed.
df1.show(5)

In [None]:
# Categorical string columns to encode as numeric indices.
strIndexCols = ["Sex", "Embarked"]
indexedCols = [i + "_indexed" for i in strIndexCols]
stringIndex = StringIndexer(inputCols=strIndexCols, outputCols=indexedCols)

# Fitting fails if an output column already exists, e.g. left over from an
# earlier run of the transform cell. Remove any such columns up front instead
# of catching every exception; drop() ignores names that are not present, so
# this is a no-op on a fresh frame and genuine errors from fit() still surface.
df1 = df1.drop(*indexedCols)
stringIndex_model = stringIndex.fit(df1)

In [None]:
# Add the "<col>_indexed" columns and drop the original string columns.
df1 = stringIndex_model.transform(df1).drop(*strIndexCols)

In [None]:
# Preview the frame after indexing.
df1.show(5)

In [None]:
# Every column except the label becomes a model input. The list is captured
# from df1 as it looks right now, and this assembler is reused by the pipeline
# built later in the notebook.
feature_cols = [name for name in df1.columns if name != "Survived"]
vec_asm = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Keep only the assembled feature vector and the label.
df1_ = vec_asm.transform(df1).select("features", "Survived")

In [None]:
# Show the assembled feature vectors in full (no column truncation).
df1_.show(5, truncate=False)

In [None]:
# 70/30 train/test split. The seed is fixed so that the split, and therefore
# every metric computed below, is reproducible across notebook runs.
train, test = df1_.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Preview the training split.
train.show(5, truncate=False)

In [None]:
# Shared evaluator for all models below: area under the ROC curve against "Survived".
evaluator = (
    BinaryClassificationEvaluator()
    .setLabelCol("Survived")
    .setMetricName("areaUnderROC")
)

In [None]:
# Logistic regression with a pure L2 (ridge) penalty.
ridge = LogisticRegression(
    labelCol="Survived",
    maxIter=100,
    regParam=0.03,
    elasticNetParam=0,  # 0 selects the L2 penalty only
)
# Fit on the training split and score the held-out split with the shared evaluator.
model = ridge.fit(train)
pred_test = model.transform(test)
evaluator.evaluate(pred_test)

In [None]:
# Logistic regression with a pure L1 (lasso) penalty.
lasso = LogisticRegression(
    labelCol="Survived",
    maxIter=100,
    regParam=0.0003,
    elasticNetParam=1,  # 1 selects the L1 penalty only
)
# Fit on the training split and score the held-out split with the shared evaluator.
model = lasso.fit(train)
pred_test = model.transform(test)
evaluator.evaluate(pred_test)

In [None]:
# Random forest baseline. The seed is fixed so the per-tree sampling, and with
# it the reported metric, is reproducible. This estimator is also reused by
# the cross-validated pipeline further down.
rdf = RandomForestClassifier(labelCol="Survived", numTrees=100, maxDepth=3, seed=42)
rdf_model = rdf.fit(train)
pred = rdf_model.transform(test)
evaluator.evaluate(pred)

In [None]:
# Gradient-boosted trees baseline, with a fixed seed for reproducible results.
gbt = GBTClassifier(labelCol="Survived", maxIter=75, maxDepth=3, seed=42)
gbt_model = gbt.fit(train)
pred = gbt_model.transform(test)
evaluator.evaluate(pred)

预测 (Prediction)

In [None]:
# Load the test CSV (first line is the header, column types inferred by Spark) and cache it.
df2 = (
    spark.read.options(header=True, inferSchema=True)
    .csv("/FileStore/tables/titanic_test.csv")
    .cache()
)

In [None]:
# Apply the same cleaning steps to the test data as were applied to the
# training data: fill Fare/Embarked, build "FamliySize", extract the title.
# The regex is a raw string so "\." stays a regex escape and does not trigger
# Python's invalid-escape-sequence warning.
df2 = (
    df2.fillna({"Fare": 14.45, "Embarked": "S"})
    .withColumn("FamliySize", df2["SibSp"] + df2["Parch"])
    .drop("Parch", "SibSp")
    .withColumn("Title", F.regexp_extract(df2["Name"], r"([A-Za-z]+)\.", 1))
)
# Collapse titles with the shared mapping, then impute missing ages with the
# per-title means computed from the training data (age_fillna_dict).
df2 = df2.withColumn("Title", mapping[df2["Title"]])
for i, j in age_fillna_dict.items():
    df2 = age_fillna(df2, i, j)
# "PassengerId" is kept here: it is selected again when the result is built.
df2 = df2.drop("Cabin", "Name", "Ticket", "Title")

In [None]:
# End-to-end pipeline: string indexing -> feature assembly -> random forest.
# It reuses the unfitted StringIndexer, the VectorAssembler (whose input
# columns were fixed when it was created) and the `rdf` estimator defined above.
pipeline_rdf = Pipeline(stages=[stringIndex, vec_asm, rdf])

In [None]:
# Hyper-parameter grid for the random forest: 3 depths x 3 minInfoGain values
# x 1 forest size = 9 candidate settings.
paramGrid = (
    tune.ParamGridBuilder()
    .addGrid(rdf.maxDepth, [3, 4, 5])
    .addGrid(rdf.minInfoGain, [0.0, 0.01, 0.1])
    .addGrid(rdf.numTrees, [1000])
    .build()
)
# 5-fold cross-validation over the whole pipeline, scored with the shared
# evaluator. Note that `cv_model` is the (unfitted) CrossValidator; the fitted
# model is produced by calling fit() on it. The seed fixes the fold assignment
# so the selected parameters are reproducible between runs.
cv_model = tune.CrossValidator(
    estimator=pipeline_rdf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)

In [None]:
# Reload the raw training data: the pipeline contains the StringIndexer stage,
# so it needs the original "Sex"/"Embarked" columns that were dropped from df1
# earlier in the notebook.
df1 = spark.read.csv(
    "/FileStore/tables/titanic_train.csv", header=True, inferSchema=True
).cache()
# Same cleaning steps as for the test data. The regex is a raw string so "\."
# stays a regex escape and does not trigger Python's invalid-escape warning.
df1 = (
    df1.fillna({"Fare": 14.45, "Embarked": "S"})
    .withColumn("FamliySize", df1["SibSp"] + df1["Parch"])
    .drop("Parch", "SibSp")
    .withColumn("Title", F.regexp_extract(df1["Name"], r"([A-Za-z]+)\.", 1))
)
# Collapse titles with the shared mapping, then impute missing ages with the
# per-title means computed earlier (age_fillna_dict).
df1 = df1.withColumn("Title", mapping[df1["Title"]])
for i, j in age_fillna_dict.items():
    df1 = age_fillna(df1, i, j)
df1 = df1.drop("Cabin", "Name", "Ticket", "Title")

In [None]:
# Run the cross-validated grid search on the full training frame.
model_best = cv_model.fit(df1)

In [None]:
# Predictions on the same frame the model was fitted on.
pred_train = model_best.transform(df1)

In [None]:
# Score the training predictions with the shared evaluator.
# NOTE: pred_train comes from the data used for fitting, so this is an
# in-sample score rather than an estimate of performance on unseen data.
evaluator.evaluate(pred_train)

In [None]:
# Predict on the prepared test frame (rebinds pred_test from the baseline cells).
pred_test = model_best.transform(df2)

In [None]:
# Keep only the passenger identifier and the predicted class.
result = pred_test.select("PassengerId", "prediction")

In [None]:
# Expose the prediction as an integer "Survived" column and drop the original column.
survived_flag = result["prediction"].cast("integer")
result = result.withColumn("Survived", survived_flag).drop("prediction")

In [None]:
# Persist the fitted cross-validation model. overwrite() lets the cell be
# re-run: a plain save() raises once the target path already exists.
model_best.write().overwrite().save("titanic_rdf.model")

In [None]:
# Write the submission file. index=False keeps the pandas row index out of the
# CSV, so the file contains only the PassengerId and Survived columns.
result.toPandas().to_csv("submission.csv", index=False)

In [None]:
!pwd

In [None]:
!pwd