In [1]:
# Echo the `spark` object so the notebook shows it as this cell's output
# (presumably the SparkSession injected by the notebook runtime -- confirm).
spark

In [2]:
# Read the task-events table and discard columns _c1, _c6 and _c12.
# Disabled variant, kept for reference: the same load, additionally keeping
# only rows whose _c0 lies strictly between 600 * 1000000 and
# 9223372036854775807:
#   .filter("_c0 > 600 * 1000000").filter("_c0 < 9223372036854775807")
taskEventsDF = (
    sqlContext
    .table("task_events_csv")
    .drop('_c1', '_c6', '_c12')
)

In [3]:
# Render the loaded task-events DataFrame with the notebook's display() helper.
display(taskEventsDF)

In [4]:
# Histogram of job sizes: for each distinct task count ('tasknum'), the number
# of jobs ('jobnum') that have exactly that many tasks.
jobTaskNumDF = (
    taskEventsDF
    .select('_c2', '_c3')                     # presumably (job id, task index) -- confirm
    .distinct()                               # one row per unique (_c2, _c3) pair
    .groupBy('_c2')
    .count()                                  # distinct _c3 values per _c2
    .withColumnRenamed('count', 'tasknum')
    .groupBy('tasknum')
    .count()                                  # how many _c2 groups share each tasknum
    .withColumnRenamed('count', 'jobnum')
)

In [5]:
# Render the (tasknum, jobnum) histogram DataFrame.
display(jobTaskNumDF)

In [6]:
# Collect the aggregated histogram to the driver as a pandas DataFrame,
# then show pandas' summary statistics for it as the cell output.
jobTaskNumPDDF = jobTaskNumDF.toPandas()
jobTaskNumPDDF.describe()

In [7]:
# Order the histogram by task count before plotting.
# DataFrame.sort(columns=...) no longer exists in pandas; sort_values(by=...)
# is its replacement and returns a new, sorted frame. Re-running this cell is
# harmless because sorting an already-sorted frame changes nothing.
jobTaskNumPDDF = jobTaskNumPDDF.sort_values(by='tasknum')
jobTaskNumPDDF

In [8]:
import matplotlib.pyplot as plt

# Stem plot of "number of jobs" against "task count", log-scaled on both axes.
# plt.subplots() creates the figure and its axes in one step, so no stray
# figure is created or cleared beforehand.
jobTaskNumFig, ax = plt.subplots()
ax.stem(jobTaskNumPDDF.tasknum, jobTaskNumPDDF.jobnum, markerfmt=' ')  # blank markerfmt: stems only
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Task Count')
ax.set_ylabel('Number of Jobs')
# Pass the figure itself to display(); plt.show() returns None.
display(jobTaskNumFig)
# Release the figure so it is neither rendered a second time nor kept in memory.
plt.close(jobTaskNumFig)

In [9]:
from pyspark.sql.functions import mean
from pyspark.sql.functions import round
# NOTE: from here on `round` is the Spark column function, not Python's built-in.

# Per group of _c2: mean of _c10 rounded to 4 decimals ('MemReq'); then count
# how many groups share each rounded value ('jobnum').
# The aggregate is named with .alias() instead of renaming Spark's
# auto-generated column name, which would silently do nothing if that
# generated name ever differed.
jobMemReqDF = (
    taskEventsDF
    .select('_c2', '_c10')
    .groupBy('_c2')
    .agg(round(mean('_c10'), 4).alias('MemReq'))
    .groupBy('MemReq')
    .count()
    .withColumnRenamed('count', 'jobnum')
)

In [10]:
# Render the (MemReq, jobnum) histogram DataFrame.
display(jobMemReqDF)

In [11]:
# Collect the memory-request histogram to the driver as a pandas DataFrame.
jobMemReqPDDF = jobMemReqDF.toPandas()
# Summary statistics are currently disabled:
# jobMemReqPDDF.describe()

In [12]:
# Order by memory request before plotting. DataFrame.sort(columns=...) no
# longer exists in pandas; sort_values(by=...) is its replacement.
jobMemReqPDDF = jobMemReqPDDF.sort_values(by='MemReq')

# Stem plot of "number of jobs" against rounded memory request, log-scaled y axis.
# plt.subplots() creates the figure and its axes together, so no stray figure
# is created or cleared beforehand.
jobMemReqFig, ax = plt.subplots()
ax.stem(jobMemReqPDDF.MemReq, jobMemReqPDDF.jobnum, markerfmt=' ')  # blank markerfmt: stems only
ax.set_yscale('log')
ax.set_xlabel('Memory Request (per task)')
ax.set_ylabel('Number of Jobs')
# Pass the figure itself to display(); plt.show() returns None.
display(jobMemReqFig)
# Release the figure so it is neither rendered a second time nor kept in memory.
plt.close(jobMemReqFig)

In [13]:
# Per group of _c2: mean of _c9 rounded to 4 decimals ('CPUReq'); then count
# how many groups share each rounded value ('jobnum').
# `round` and `mean` are the Spark column functions imported in an earlier cell.
# The aggregate is named with .alias() rather than by renaming Spark's
# auto-generated column name, which would silently do nothing on a mismatch.
jobCPUReqDF = (
    taskEventsDF
    .select('_c2', '_c9')
    .groupBy('_c2')
    .agg(round(mean('_c9'), 4).alias('CPUReq'))
    .groupBy('CPUReq')
    .count()
    .withColumnRenamed('count', 'jobnum')
)

# Collect to pandas and order by CPU request for plotting.
# DataFrame.sort(columns=...) no longer exists in pandas; use sort_values(by=...).
jobCPUReqPDDF = jobCPUReqDF.toPandas()
jobCPUReqPDDF = jobCPUReqPDDF.sort_values(by='CPUReq')

In [14]:
# Stem plot of "number of jobs" against rounded CPU request, log-scaled y axis.
# plt.subplots() creates the figure and its axes together, so no stray figure
# is created or cleared beforehand.
jobCPUReqFig, ax = plt.subplots()
ax.stem(jobCPUReqPDDF.CPUReq, jobCPUReqPDDF.jobnum, markerfmt=' ')  # blank markerfmt: stems only
ax.set_yscale('log')
ax.set_xlabel('CPU Request (per task)')
ax.set_ylabel('Number of Jobs')
# Pass the figure itself to display(); plt.show() returns None.
display(jobCPUReqFig)
# Release the figure so it is neither rendered a second time nor kept in memory.
plt.close(jobCPUReqFig)

In [15]:
from functools import reduce

# The usage data is read from ten tables, task_usage_1_csv .. task_usage_10_csv,
# keeping the same four columns from each.
taskUsageParts = [
    sqlContext.table("task_usage_{}_csv".format(part)).select('_c2', '_c3', '_c10', '_c13')
    for part in range(1, 11)
]

# Stack all parts into one DataFrame (same left-to-right union order as
# chaining .union() by hand), then give the columns readable names.
taskUsageDF = (
    reduce(lambda stacked, nextPart: stacked.union(nextPart), taskUsageParts)
    .withColumnRenamed('_c2', 'Job Id')
    .withColumnRenamed('_c3', 'Task Index')
    .withColumnRenamed('_c10', 'Max Mem')
    .withColumnRenamed('_c13', 'Max CPU')
)

In [16]:
# Preview five rows of the combined task-usage DataFrame.
display(taskUsageDF.limit(5))

In [17]:
from pyspark.sql.functions import max
# NOTE: from here on `max` is the Spark aggregate, not Python's built-in.

# Largest _c10 value (memory request) recorded for each job in the task events.
# The cast makes max() compare numerically even if the table stores the column
# as strings, where the maximum would otherwise be lexicographic.
jobMemMaxReqDF = (
    taskEventsDF
    .groupBy('_c2')
    .agg(max(taskEventsDF['_c10'].cast('double')).alias('MaxMemRequest'))
    .withColumnRenamed('_c2', 'Job Id')
)

# Largest 'Max Mem' value observed for each job in the usage tables.
jobMemUsageDF = (
    taskUsageDF
    .groupBy('Job Id')
    .agg(max(taskUsageDF['Max Mem'].cast('double')).alias('MaxMemUsage'))
)

# Keep only jobs present on both sides. An outer join followed by na.drop()
# yields the same rows, because every unmatched row carries a null in the
# other side's column; the inner join just avoids building them first.
# na.drop() stays to remove rows whose aggregates are themselves null.
jobMemPortionDF = jobMemUsageDF.join(jobMemMaxReqDF, 'Job Id', 'inner').na.drop()

In [18]:
# Preview five rows of the joined per-job usage/request DataFrame.
display(jobMemPortionDF.limit(5))

In [19]:
# Turn the per-job frame into a histogram: 'portion' is MaxMemUsage divided by
# MaxMemRequest, rounded to 3 decimals (Spark `round`), and 'jobnum' counts the
# jobs that share each portion value.
# NOTE(review): this rebinds jobMemPortionDF to the histogram, so the cell
# cannot be run twice in a row -- the cell that builds the join has to be
# re-run first.
jobMemPortionDF = (
    jobMemPortionDF
    .withColumn(
        'portion',
        round(jobMemPortionDF['MaxMemUsage'] / jobMemPortionDF['MaxMemRequest'], 3),
    )
    .select('Job Id', 'portion')
    .groupBy('portion')
    .count()
    .withColumnRenamed('count', 'jobnum')
)

In [20]:
# Preview five rows of the (portion, jobnum) histogram.
display(jobMemPortionDF.limit(5))

In [21]:
# Collect the histogram to pandas, discard rows with missing values, and order
# by portion for plotting.
# DataFrame.sort(columns=...) no longer exists in pandas; sort_values(by=...)
# is its replacement.
jobMemPortionPDDF = jobMemPortionDF.toPandas().dropna().sort_values(by='portion')

In [22]:
# Show the pandas dtype of each column in the collected histogram.
jobMemPortionPDDF.dtypes

In [23]:
# Stem plot of "number of jobs" against memory usage/request ratio, log-log axes.
# plt.subplots() creates the figure and its axes together, so no stray figure
# is created or cleared beforehand.
jobMemPortionFig, ax = plt.subplots()
ax.stem(jobMemPortionPDDF.portion, jobMemPortionPDDF.jobnum, markerfmt=' ')  # blank markerfmt: stems only
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Portion of Memory Request Used (per task)')
ax.set_ylabel('Number of Jobs')
# Pass the figure itself to display(); plt.show() returns None.
display(jobMemPortionFig)
# Release the figure so it is neither rendered a second time nor kept in memory.
plt.close(jobMemPortionFig)

In [24]:
# Zoom in on the histogram rows whose portion is below 2.
jobMemSmallPortionDF = jobMemPortionDF.na.drop().filter("portion < 2")
# DataFrame.sort(columns=...) no longer exists in pandas; sort_values(by=...)
# is its replacement.
jobMemSmallPortionPDDF = jobMemSmallPortionDF.toPandas().sort_values(by='portion')

# Stem plot on a linear x axis and log-scaled y axis.
# plt.subplots() creates the figure and its axes together, so no stray figure
# is created or cleared beforehand.
jobMemSmallPortionFig, ax = plt.subplots()
ax.stem(jobMemSmallPortionPDDF.portion, jobMemSmallPortionPDDF.jobnum, markerfmt=' ')  # blank markerfmt: stems only
ax.set_yscale('log')
ax.set_xlabel('Portion of Memory Request Used (per task)')
ax.set_ylabel('Number of Jobs')
# Pass the figure itself to display(); plt.show() returns None.
display(jobMemSmallPortionFig)
# Release the figure so it is neither rendered a second time nor kept in memory.
plt.close(jobMemSmallPortionFig)

In [25]:
# `max` and `round` below are the Spark functions imported in earlier cells.

# Largest _c9 value (CPU request) recorded for each job in the task events.
# The cast makes max() compare numerically even if the table stores the column
# as strings, where the maximum would otherwise be lexicographic.
jobCPUMaxReqDF = (
    taskEventsDF
    .groupBy('_c2')
    .agg(max(taskEventsDF['_c9'].cast('double')).alias('MaxCPURequest'))
    .withColumnRenamed('_c2', 'Job Id')
)

# Largest 'Max CPU' value observed for each job in the usage tables.
jobCPUUsageDF = (
    taskUsageDF
    .groupBy('Job Id')
    .agg(max(taskUsageDF['Max CPU'].cast('double')).alias('MaxCPUUsage'))
)

# Keep only jobs present on both sides. An outer join followed by na.drop()
# yields the same rows, because every unmatched row carries a null in the
# other side's column; the inner join just avoids building them first.
jobCPUPortionDF = jobCPUUsageDF.join(jobCPUMaxReqDF, 'Job Id', 'inner').na.drop()

# Histogram of usage/request ratio, rounded to 3 decimals: jobs per portion value.
jobCPUPortionDF = (
    jobCPUPortionDF
    .withColumn(
        'portion',
        round(jobCPUPortionDF['MaxCPUUsage'] / jobCPUPortionDF['MaxCPURequest'], 3),
    )
    .select('Job Id', 'portion')
    .groupBy('portion')
    .count()
    .withColumnRenamed('count', 'jobnum')
)

# DataFrame.sort(columns=...) no longer exists in pandas; sort_values(by=...)
# is its replacement.
jobCPUPortionPDDF = jobCPUPortionDF.toPandas().dropna().sort_values(by='portion')

# Stem plot on log-log axes. plt.subplots() creates the figure and its axes
# together, so no stray figure is created or cleared beforehand.
jobCPUPortionFig, ax = plt.subplots()
ax.stem(jobCPUPortionPDDF.portion, jobCPUPortionPDDF.jobnum, markerfmt=' ')  # blank markerfmt: stems only
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('Portion of CPU Request Used (per task)')
ax.set_ylabel('Number of Jobs')
# Pass the figure itself to display(); plt.show() returns None.
display(jobCPUPortionFig)
# Release the figure so it is neither rendered a second time nor kept in memory.
plt.close(jobCPUPortionFig)