In [85]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# Obtain the Spark session for this notebook (getOrCreate returns the active one if present).
spark = (
    SparkSession.builder
    .appName("cse6250group")
    .getOrCreate()
)


In [86]:
# Load the recordings table with an explicit schema rather than type inference.
# Fields are declared one per line; each is nullable (the default).
df = spark.read.csv(
    'model data.csv',
    header=True,
    schema=(
        StructType()
        .add('recording', LongType())
        .add('Source', StringType())
        .add('age', LongType())
        .add('sex', StringType())
        .add('diagnosis', StringType())
        .add('other problems', StringType())
        .add('epoches', LongType())
        .add('w%', DoubleType())
        .add('n1%', DoubleType())
        .add('n2%', DoubleType())
        .add('n3%', DoubleType())
        .add('rem%', DoubleType())
    ),
)

In [87]:
# Preview the loaded table; long string values are shortened in the printout.
df.show(n=20, truncate=True)

+---------+-------+---+---+--------------------+--------------------+-------+-----+-----+-----+-----+-----+
|recording| Source|age|sex|           diagnosis|      other problems|epoches|   w%|  n1%|  n2%|  n3%| rem%|
+---------+-------+---+---+--------------------+--------------------+-------+-----+-----+-----+-----+-----+
|        1|isruc-1| 64|  M|                SAOS|          Depression|    880| 30.0|  8.3|22.05|26.25|13.41|
|        2|isruc-1| 52|  M|                SAOS|Restless leg synd...|    964|25.41|11.93|35.79|16.29|10.58|
|        3|isruc-1| 38|  M|REM Sleep Behavio...|                PLMS|    943| 14.0| 17.5|26.09|18.35|24.07|
|        4|isruc-1| 27|  M|               SRVAS|           Epilepsy |    963| 2.91| 6.75|44.24|22.22|23.88|
|        5|isruc-1| 58|  F|                SAOS|            Insomnia|    875|33.83|12.34|30.29|18.74|  4.8|
|        6|isruc-1| 22|  M|                PLMS|Epilepsy; brain t...|    897|80.49| 1.78| 6.69|11.04|  0.0|
|        7|isruc-1| 70|  M| 

In [88]:
# Glossary for the Portuguese / abbreviated labels found in the `diagnosis` column:
# RONCOPATIA = snoring
# S. PERNAS INQUIETAS = restless leg syndrome
# SAOS = sleep apnea
# PLMS = periodic limb movements of sleep
# EPILEPSIA = epilepsy
# PRIVAÇÃO DE SONO = sleep deprivation
# Not yet translated: SRVAS, D. Afectiva

In [111]:
from pyspark.sql.functions import col,when,lower

# A diagnosis value of exactly "F" is replaced with null; every other value
# (including existing nulls) is kept as-is. Spark counterpart of a pandas np.where.
# NOTE(review): "F" looks like a stray sex code in the diagnosis column - confirm.
df = df.withColumn(
    'diagnosis',
    when(df['diagnosis'] == "F", None).otherwise(df['diagnosis']),
)

In [112]:
# List every diagnosis label once to inspect the values present after the clean-up.
(
    df
    .select("diagnosis")
    .distinct()
    .show()
)

+--------------------+
|           diagnosis|
+--------------------+
|    PRIVAÇÃO DE SONO|
|REM Sleep Behavio...|
|                PLMS|
|           EPILEPSIA|
|                SAOS|
|         D. Afectiva|
| S. PERNAS INQUIETAS|
|         Parasomnia |
|          no problem|
|          RONCOPATIA|
|               SRVAS|
+--------------------+



In [113]:
# One-hot encode gender as two boolean flag columns.
# The comparison is case-insensitive: `sex` is lower-cased before matching "m" / "f".
sex_lower = lower(df['sex'])

df = (
    df
    .withColumn('is_male', sex_lower == "m")
    .withColumn('is_female', sex_lower == "f")
)

# Show the first row to check the new columns.
df.head()

Row(recording=1, Source='isruc-1', age=64, sex='M', diagnosis='SAOS', other problems='Depression', epoches=880, w%=30.0, n1%=8.3, n2%=22.05, n3%=26.25, rem%=13.41, is_male=True, is_female=False)

In [114]:
# Keep only rows where every listed column is non-null.
# Note: 'age' and 'diagnosis' are required here although they are not part of
# the feature vector assembled further down.
df = df.na.drop(
    subset=[
        'w%',
        'n1%',
        'n2%',
        'n3%',
        'rem%',
        'age',
        'sex',
        'diagnosis',
    ]
)

In [115]:
# Re-list the distinct diagnosis labels now that incomplete rows have been dropped.
(
    df
    .select("diagnosis")
    .distinct()
    .show()
)

+--------------------+
|           diagnosis|
+--------------------+
|    PRIVAÇÃO DE SONO|
|REM Sleep Behavio...|
|                PLMS|
|           EPILEPSIA|
|                SAOS|
|         D. Afectiva|
| S. PERNAS INQUIETAS|
|         Parasomnia |
|          no problem|
|          RONCOPATIA|
|               SRVAS|
+--------------------+



In [116]:
from pyspark.ml.feature import VectorAssembler

# Columns packed into the model input, in vector order:
# the five sleep-stage percentages followed by the two gender flags.
feature_columns = [
    'w%',
    'n1%',
    'n2%',
    'n3%',
    'rem%',
    'is_male',
    'is_female',
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Adds a 'features' vector column next to the existing columns.
transformed_df = assembler.transform(df)

In [109]:
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Fit a bisecting k-means model on the assembled feature vectors.
# The seed is fixed so repeated runs start from the same state.
# NOTE(review): k=11 presumably mirrors the 11 distinct diagnosis labels - confirm.
bkm = (
    BisectingKMeans()
    .setK(11)
    .setSeed(42)
)
model = bkm.fit(transformed_df)

# Assign each row to a cluster.
predictions = model.transform(transformed_df)

# Score the clustering with the evaluator's default settings.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")

# Print the fitted cluster centres, one per line.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center_vector in centers:
    print(center_vector)

Silhouette with squared euclidean distance = 0.1909549264810547
Cluster Centers: 
[20.54857143 16.02928571 34.32857143 15.08214286 14.01142857  0.71428571
  0.28571429]
[11.08434783 14.84043478 40.08173913 19.84043478 14.15304348  0.47826087
  0.52173913]
[ 9.02333333 17.64555556 49.69333333 11.29666667 12.33888889  0.66666667
  0.33333333]
[10.22866667  6.632      31.10133333 31.45       20.58666667  0.73333333
  0.26666667]
[12.49222222 16.35       28.14777778 24.99777778 18.01444444  0.55555556
  0.44444444]
[21.82846154 10.97384615 30.97769231 24.33153846 11.89076923  0.61538462
  0.38461538]
[47.31833333 14.19       22.98833333  9.24833333  6.25        0.66666667
  0.33333333]
[72.5         5.66666667  7.20666667 13.64333333  0.94666667  0.66666667
  0.33333333]
[33.95466667 15.682      24.39866667 16.73266667  9.23333333  0.6
  0.4       ]
[33.85       13.33142857 33.95285714  9.65428571  9.21        0.57142857
  0.42857143]
[30.651  9.364 21.157 24.668 14.161  0.5    0.5  ]


In [117]:
import mne

NotImplementedError: Only EDF files are supported by read_raw_edf, got rec