In [1]:
# Point findspark at the local Spark distribution; presumably this is what
# makes `pyspark` importable in this kernel, so it has to run before the
# `import pyspark` below.
# NOTE(review): machine-specific absolute path — confirm it matches the
# environment the notebook is executed in.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
# Get the active SparkSession if one exists, otherwise create one whose
# application name is 'Iteration 4'.
spark = SparkSession.builder.appName('Iteration 4').getOrCreate()

In [2]:
from pyspark.sql.types import (StructField,StringType,IntegerType,FloatType,StructType)

# (column name, Spark type class) pairs in the order the schema declares them.
column_types = [
    ('age', IntegerType),
    ('gender', IntegerType),
    ('height', IntegerType),
    ('weight', FloatType),
    ('blood pressure', IntegerType),
    ('cholesterol', IntegerType),
    ('gluc', IntegerType),
    ('smoke', IntegerType),
    ('alco', IntegerType),
    ('active', IntegerType),
    ('cardio', IntegerType),
]

# Every column is declared nullable (third StructField argument is True).
data_schema = [StructField(name, spark_type(), True) for name, spark_type in column_types]

final_struct = StructType(fields=data_schema)

# Read the CSV with the header option enabled and the explicit schema above
# instead of letting Spark infer column types.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(final_struct)
    .load("./new_train.csv")
)

###### 4.1 Reduce the data

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the same CSV into pandas for the exploratory correlation analysis.
pdf = pd.read_csv("./new_train.csv")

# Pairwise Pearson correlation between the columns of `pdf`.
corr = pdf.corr(method='pearson')
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Generate a mask for the upper triangle (diagonal included) so each pair of
# columns is drawn only once. The builtin `bool` is used because the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Draw the heatmap with the mask and correct aspect ratio on the explicit
# axes created above (rather than relying on the implicit current axes).
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, annot=True,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
# Trailing semicolon suppresses the text repr of the returned object.
ax.set_title("Pearson correlation between columns");

In [4]:
# Columns removed from the Spark DataFrame for the reduced dataset.
dropped_columns = ['gender', 'height', 'smoke', 'alco', 'active']
df = df.drop(*dropped_columns)
# Print the first rows of the reduced DataFrame.
df.show()

+---+------+--------------+-----------+----+------+
|age|weight|blood pressure|cholesterol|gluc|cardio|
+---+------+--------------+-----------+----+------+
| 50|  62.0|             3|          1|   1|     0|
| 55|  85.0|             4|          3|   1|     1|
| 52|  64.0|             3|          3|   1|     1|
| 48|  82.0|             4|          1|   1|     1|
| 60|  67.0|             3|          2|   2|     0|
| 61|  93.0|             3|          3|   1|     0|
| 62|  95.0|             3|          3|   3|     1|
| 48|  71.0|             1|          1|   1|     0|
| 62|  80.0|             3|          1|   1|     0|
| 52|  60.0|             3|          1|   1|     0|
| 41|  60.0|             3|          1|   1|     0|
| 54|  78.0|             1|          1|   1|     0|
| 40|  95.0|             3|          1|   1|     0|
| 58|  75.0|             3|          1|   1|     0|
| 46|  52.0|             1|          1|   3|     0|
| 48|  68.0|             1|          1|   1|     0|
| 60|  56.0|

###### 4.2 Project the data

In [5]:
# Number of rows in each `cardio` class, to check how balanced the target is.
cardio_counts = pdf["cardio"].value_counts()
cardio_counts

0    31634
1    30868
Name: cardio, dtype: int64

In [6]:
# Down-sample every `cardio` class to the size of the smallest class so that
# both classes end up with the same number of rows.
g = pdf.groupby("cardio")
# Size of the smallest class, computed once instead of once per group.
n_per_class = g.size().min()
# A fixed random_state makes the sample (and everything derived from it)
# identical on every run; without it the balanced set changes each time the
# cell is executed.
balanced = g.apply(
    lambda x: x.sample(n_per_class, random_state=42).reset_index(drop=True)
)

# Confirm that both classes now have the same count.
balanced["cardio"].value_counts()

1    30868
0    30868
Name: cardio, dtype: int64

In [7]:
# Persist the balanced sample without its index, then reload it from that
# file so `balanced` is rebuilt from the CSV contents alone.
balanced.to_csv(path_or_buf="balanced.csv", index=False, encoding='utf-8')
balanced = pd.read_csv(filepath_or_buffer="./balanced.csv")