In [8]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Window, Row
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T
import pandas as pd

In [3]:
spark = SparkSession.builder.appName('covid19').getOrCreate()

In [4]:
result = spark.read\
    .option("encoding", "UTF-8")\
    .option("delimiter", ",")\
    .option("parserLib", "univocity")\
    .option("multiLine", "true")\
    .option("escape", "\"")\
    .parquet("word_count.parquet")

In [5]:
result2 = result.groupby('word')\
    .agg(F.sum('count').alias('count'))\
    .orderBy(F.desc('count'))

In [6]:
words = [row[0] for row in result2.limit(1000).select('word').collect()]
if 'date' in words:
    words.remove('date')

In [121]:
result3 = result.groupBy('date').pivot('word', words).max('count').orderBy('date').fillna(0)

In [26]:
df_cases = spark.read.csv('covid_numconf.csv', header=True)

In [29]:
df_cases = df_cases.select(
        F.from_unixtime(F.unix_timestamp('date', 'yyyy-MM-dd')).cast('timestamp').alias('date'),
        F.col('numconf').cast('Long'),
        F.lit(1).alias('temp')
    )

window = Window.partitionBy('temp').orderBy('date')

df_cases = df_cases.withColumn("numconf_lead3", (F.lead('numconf', 3).over(window)))
df_cases = df_cases.withColumn("numconf_lead7", (F.lead('numconf', 7).over(window)))
df_cases = df_cases.withColumn("numnew", (F.col('numconf') - F.lag('numconf', 1).over(window)))
df_cases = df_cases.withColumn("numnew_lead3", (F.lead('numnew', 3).over(window)))
df_cases = df_cases.withColumn("numnew_lead7", (F.lead('numnew', 7).over(window)))

df_cases = df_cases.select(
    'date',
    'numconf',
    'numconf_lead3',
    'numconf_lead7',
    'numnew',
    'numnew_lead3',
    'numnew_lead7'
)

In [30]:
df_cases.toPandas()

Unnamed: 0,date,numconf,numconf_lead3,numconf_lead7,numnew,numnew_lead3,numnew_lead7
0,2020-03-01,24,39.0,62.0,,6.0,5.0
1,2020-03-02,28,45.0,77.0,4.0,6.0,15.0
2,2020-03-03,33,51.0,90.0,5.0,6.0,13.0
3,2020-03-04,39,57.0,103.0,6.0,6.0,13.0
4,2020-03-05,45,62.0,138.0,6.0,5.0,35.0
5,2020-03-06,51,77.0,176.0,6.0,15.0,38.0
6,2020-03-07,57,90.0,193.0,6.0,13.0,17.0
7,2020-03-08,62,103.0,249.0,5.0,13.0,56.0
8,2020-03-09,77,138.0,324.0,15.0,35.0,75.0
9,2020-03-10,90,176.0,424.0,13.0,38.0,100.0


In [31]:
result4 = result3.join(df_cases, on='date', how='left')

NameError: name 'result3' is not defined

In [137]:
result4.stat.corr("coronaviru", "numconf")

nan

In [None]:
result4.persist()

In [None]:
result4.write\
    .option("encoding", "UTF-8")\
    .option("delimiter", ",")\
    .option("parserLib", "univocity")\
    .mode("overwrite")\
    .parquet("word_count_pivot.parquet")

In [140]:
sc = SparkContext.getOrCreate()

In [None]:
corr_coefs_new = [result4.stat.corr(w, 'numnew') for w in words]

In [None]:
corr_coefs_new_lead3 = [result4.stat.corr(w, 'numnew_lead3') for w in words]

In [None]:
corr_coefs_new_lead7 = [result4.stat.corr(w, 'numnew_lead7') for w in words]

In [2]:
rdd = sc.parallelize(zip(words, corr_coefs_new, corr_coefs_new_lead3, corr_coefs_new_lead7))
corr_rdd = rdd.map(lambda x: Row(word=x[0], corr_new=x[1], corr_new_lead3=x[2], corr_new_lead7=x[3]))
corr_df = spark.createDataFrame(corr_rdd)

NameError: name 'sc' is not defined

In [None]:
corr_df.toPandas().to_csv("word_correlation_new_case.csv", index=False)