In [1]:
import pickle as pckl
import pyspark
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

spark = SparkSession \
    .builder \
    .appName("osrs")\
    .getOrCreate()

sqlContext = pyspark.sql.SQLContext(spark)

players_xp = pckl.load(open('data/players_xp.pckl', 'rb'))
players_rank = pckl.load(open('data/players_rank.pckl', 'rb'))

In [2]:
columns = ['name', 'date', 'overall', 'attack', 'defence', 'strength', 'hitpoints', 'ranged', 'prayer', 'magic', 
               'cooking', 'woodcutting', 'fletching', 'fishing', 'firemaking', 'crafting', 'smithing', 'mining', 
               'herblore', 'agility', 'thieving', 'slayer', 'farming', 'runecraft', 'hunter', 'construction']

In [3]:
# convert timestamp to unix time
players_xp['date'] = players_xp['date'].astype(np.int64) // 10**9
players_rank['date'] = players_rank['date'].astype(np.int64) // 10**9

In [4]:
xp_df = sqlContext.createDataFrame(players_xp)
rank_df = sqlContext.createDataFrame(players_rank)

In [5]:
sqlContext.registerDataFrameAsTable(xp_df, 'players_xp')
sqlContext.registerDataFrameAsTable(rank_df, 'players_rank')

In [44]:
sqlContext.sql('''SELECT count(DISTINCT date) from players_xp''').collect()

[Row(count(DISTINCT date)=1420)]

In [45]:
sqlContext.sql('''SELECT name, (max - lag) as rate FROM 
(SELECT name,

MAX(overall) OVER (PARTITION BY name) as overall,
LAG(overall) OVER (PARTITION BY name ORDER BY date) as lag
FROM players_xp
LIMIT 5)
 ''').collect()

[Row(name='0x8000FFFF', rate=None),
 Row(name='0x8000FFFF', rate=555756),
 Row(name='18fish1803', rate=None),
 Row(name='18fish1803', rate=0),
 Row(name='1st PureEver', rate=None)]

In [None]:
sqlContext.sql('''SELECT name, (max - lag) as rate FROM 
(SELECT name,
LAG(overall) OVER (PARTITION BY name ORDER BY date) as lag
FROM players_xp
LIMIT 5)
 ''').collect()

In [55]:
sqlContext.sql('''SELECT name, 
ROUND((date - lag_date)/3600,2) as hours 
FROM
(SELECT name, 
date,
LAG(date) OVER (PARTITION by name ORDER BY date) as lag_date
FROM players_xp
LIMIT 5)''').collect()

[Row(name='0x8000FFFF', hours=None),
 Row(name='0x8000FFFF', hours=40.24),
 Row(name='18fish1803', hours=None),
 Row(name='18fish1803', hours=40.67),
 Row(name='1st PureEver', hours=None)]

In [109]:
a = sqlContext.sql('''SELECT
*, date-lag_date as lag
FROM
(SELECT name, 
date, attack, defence, strength, hitpoints, ranged, prayer, magic, cooking, woodcutting, fletching, fishing, 
firemaking, crafting, smithing, mining, herblore, agility, thieving, slayer, farming, runecraft, hunter, construction,
MIN(date) OVER (PARTITION by name ORDER BY date) as min_date,
LAG(date) OVER (PARTITION by name ORDER BY date) as lag_date,
LAG(attack) OVER (PARTITION by name ORDER BY date) AS lag_attack,
LAG(strength) OVER (PARTITION by name ORDER BY date) AS lag_defence,
LAG(defence) OVER (PARTITION by name ORDER BY date) AS lag_strength,
LAG(hitpoints) OVER (PARTITION by name ORDER BY date) AS lag_hitpoints,
LAG(ranged) OVER (PARTITION by name ORDER BY date) AS lag_ranged,
LAG(prayer) OVER (PARTITION by name ORDER BY date) AS lag_prayer,
LAG(magic) OVER (PARTITION by name ORDER BY date) AS lag_magic,
LAG(cooking) OVER (PARTITION by name ORDER BY date) AS lag_cooking,
LAG(woodcutting) OVER (PARTITION by name ORDER BY date) AS lag_woodcutting,
LAG(fletching) OVER (PARTITION by name ORDER BY date) AS lag_fletching,
LAG(fishing) OVER (PARTITION by name ORDER BY date) AS lag_fishing,
LAG(firemaking) OVER (PARTITION by name ORDER BY date) AS lag_firemaking,
LAG(crafting) OVER (PARTITION by name ORDER BY date) AS lag_crafting,
LAG(smithing) OVER (PARTITION by name ORDER BY date) AS lag_smithing,
LAG(mining) OVER (PARTITION by name ORDER BY date) AS lag_mining,
LAG(herblore) OVER (PARTITION by name ORDER BY date) AS lag_herblore,
LAG(agility) OVER (PARTITION by name ORDER BY date) AS lag_agility,
LAG(thieving) OVER (PARTITION by name ORDER BY date) AS lag_thieving,
LAG(slayer) OVER (PARTITION by name ORDER BY date) AS lag_slayer,
LAG(farming) OVER (PARTITION by name ORDER BY date) AS lag_farming,
LAG(runecraft) OVER (PARTITION by name ORDER BY date) AS lag_runecraft,
LAG(hunter) OVER (PARTITION by name ORDER BY date) AS lag_hunter,
LAG(construction) OVER (PARTITION by name ORDER BY date) AS lag_construction
FROM players_xp
LIMIT 5) 
WHERE date != min_date''').collect()

In [137]:
sqlContext.sql('''SELECT name, 3600*magic_rate/timediff as magic_hr FROM(
SELECT a.name, (b.date - a.date) as timediff, (b.magic - a.magic) as magic_rate
FROM players_xp as a
JOIN (SELECT name, date, magic, LAG(date) OVER (PARTITION BY name ORDER BY date) as lag_date from players_xp) b
ON a.name = b.name AND a.date = b.lag_date
ORDER BY magic_rate DESC
LIMIT 5)''').collect()

[Row(name='Piety Viet', magic_hr=114022.49274097905),
 Row(name='Lucifer420', magic_hr=64452.10518571459),
 Row(name='Bowl Chopper', magic_hr=56640.78858013047),
 Row(name='16k Risk', magic_hr=55959.81646747616),
 Row(name='I-Hunt-Nerds', magic_hr=51387.38164277763)]

In [None]:
sqlContext.sql('''SELECT name, 
date, attack, defence, strength, hitpoints, ranged, prayer, magic, cooking, woodcutting, fletching, fishing, 
firemaking, crafting, smithing, mining, herblore, agility, thieving, slayer, farming, runecraft, hunter, construction'''

In [6]:
player_rates = sqlContext.createDataFrame(sqlContext.sql('''
SELECT name, 
date,
ROUND(3600*attack_diff/time_diff, 2) as attack_rate,
ROUND(3600*defence_diff/time_diff, 2) as defence_rate,
ROUND(3600*strength_diff/time_diff, 2) as strength_rate,
ROUND(3600*hitpoints_diff/time_diff, 2) as hitpoints_rate,
ROUND(3600*ranged_diff/time_diff, 2) as ranged_rate,
ROUND(3600*prayer_diff/time_diff, 2) as prayer_rate,
ROUND(3600*magic_diff/time_diff, 2) as magic_rate,
ROUND(3600*cooking_diff/time_diff, 2) as cooking_rate,
ROUND(3600*woodcutting_diff/time_diff, 2) as woodcutting_rate,
ROUND(3600*fletching_diff/time_diff, 2) as fletching_rate,
ROUND(3600*fishing_diff/time_diff, 2) as fishing_rate,
ROUND(3600*firemaking_diff/time_diff, 2) as firemaking_rate,
ROUND(3600*crafting_diff/time_diff, 2) as crafting_rate,
ROUND(3600*smithing_diff/time_diff, 2) as smithing_rate,
ROUND(3600*mining_diff/time_diff, 2) as mining_rate,
ROUND(3600*herblore_diff/time_diff, 2) as herblore_rate,
ROUND(3600*agility_diff/time_diff, 2) as agility_rate,
ROUND(3600*thieving_diff/time_diff, 2) as thieving_rate,
ROUND(3600*slayer_diff/time_diff, 2) as slayer_rate,
ROUND(3600*farming_diff/time_diff, 2) as farming_rate,
ROUND(3600*runecraft_diff/time_diff, 2) as runecraft_rate,
ROUND(3600*hunter_diff/time_diff, 2) as hunter_rate,
ROUND(3600*construction_diff/time_diff, 2) as construction_rate
FROM(
SELECT a.name, 
a.date,
(b.date - a.date) as time_diff, 
(b.overall - a.overall) as overall_diff,
(b.attack - a.attack) as attack_diff,
(b.defence - a.defence) as defence_diff,
(b.strength - a.strength) as strength_diff,
(b.hitpoints - a.hitpoints) as hitpoints_diff,
(b.ranged - a.ranged) as ranged_diff,
(b.prayer - a.prayer) as prayer_diff,
(b.magic - a.magic) as magic_diff,
(b.cooking - a.cooking) as cooking_diff,
(b.woodcutting - a.woodcutting) as woodcutting_diff,
(b.fletching - a.fletching) as fletching_diff,
(b.fishing - a.fishing) as fishing_diff,
(b.firemaking - a.firemaking) as firemaking_diff,
(b.crafting - a.crafting) as crafting_diff,
(b.smithing - a.smithing) as smithing_diff,
(b.mining - a.mining) as mining_diff,
(b.herblore - a.herblore) as herblore_diff,
(b.agility - a.agility) as agility_diff,
(b.thieving - a.thieving) as thieving_diff,
(b.slayer - a.slayer) as slayer_diff,
(b.farming - a.farming) as farming_diff,
(b.runecraft - a.runecraft) as runecraft_diff,
(b.hunter - a.hunter) as hunter_diff,
(b.construction - a.construction) as construction_diff
FROM players_xp as a
JOIN (SELECT *, LAG(date) OVER (PARTITION BY name ORDER BY date) as lag_date FROM players_xp) AS b
ON a.name = b.name AND a.date = b.lag_date)
WHERE overall_diff != 0''').collect())

In [10]:
rates_df = player_rates.toPandas()

In [12]:
len(rates_df)

12882

In [157]:
# does not work, not sure where error is
sqlContext.sql('''
SELECT a.name, 
a.date,
3600*(b.attack - a.attack)/b.time_diff as attack_rate,
3600*(b.defence - a.defence)/b.time_diff as defence_rate,
3600*(b.strength - a.strength)/b.time_diff as strength_rate,
3600*(b.hitpoints - a.hitpoints)/b.time_diff as hitpoints_rate,
3600*(b.ranged - a.ranged)/b.time_diff as ranged_rate,
3600*(b.prayer - a.prayer)/b.time_diff as prayer_rate,
3600*(b.magic - a.magic)f/b.time_diff as magic_rate,
3600*(b.cooking - a.cooking)/b.time_diff as cooking_rate,
3600*(b.woodcutting - a.woodcutting)/b.time_diff as woodcutting_rate,
3600*(b.fletching - a.fletching)/b.time_diff as fletching_rate,
3600*(b.fishing - a.fishing)/b.time_diff as fishing_rate,
3600*(b.firemaking - a.firemaking)/b.time_diff as firemaking_rate,
3600*(b.crafting - a.crafting)/b.time_diff as crafting_rate,
3600*(b.smithing - a.smithing)/time_db.time_diffiff as smithing_rate,
3600*(b.mining - a.mining)/b.time_diff as mining_rate,
3600*(b.herblore - a.herblore)/b.time_diff as herblore_rate,
3600*(b.agility - a.agility)/b.time_diff as agility_rate,
3600*(b.thieving - a.thieving)/b.time_diff as thieving_rate,
3600*(b.slayer - a.slayer)/b.time_diff as slayer_rate,
3600*(b.farming - a.farming)/b.time_diff as farming_rate,
3600*(b.runecraft - a.runecraft)/b.time_diff as runecraft_rate,
3600*(b.hunter - a.hunter)/b.time_diff as hunter_rate,
3600*(b.construction - a.construction)/b.time_diff as construction_rate
FROM players_xp as a
JOIN (
SELECT c.*, d.lag_date, (d.date - c.date) as time_diff
FROM players_xp AS c
JOIN 
(SELECT name, date, LAG(date) OVER (PARTITION BY name ORDER BY date) AS lag_date 
FROM players_xp) AS d
ON c.name = d.name AND c.date = d.lag_date) AS b
ON a.name = b.name AND a.date = b.date''').collect()

ParseException: "\nmismatched input '/' expecting <EOF>(line 10, pos 25)\n\n== SQL ==\n\nSELECT a.name, \na.date,\n3600*(b.attack - a.attack)/b.time_diff as attack_rate,\n3600*(b.defence - a.defence)/b.time_diff as defence_rate,\n3600*(b.strength - a.strength)/b.time_diff as strength_rate,\n3600*(b.hitpoints - a.hitpoints)/b.time_diff as hitpoints_rate,\n3600*(b.ranged - a.ranged)/b.time_diff as ranged_rate,\n3600*(b.prayer - a.prayer)/b.time_diff as prayer_rate,\n3600*(b.magic - a.magic)f/b.time_diff as magic_rate,\n-------------------------^^^\n3600*(b.cooking - a.cooking)/b.time_diff as cooking_rate,\n3600*(b.woodcutting - a.woodcutting)/b.time_diff as woodcutting_rate,\n3600*(b.fletching - a.fletching)/b.time_diff as fletching_rate,\n3600*(b.fishing - a.fishing)/b.time_diff as fishing_rate,\n3600*(b.firemaking - a.firemaking)/b.time_diff as firemaking_rate,\n3600*(b.crafting - a.crafting)/b.time_diff as crafting_rate,\n3600*(b.smithing - a.smithing)/time_db.time_diffiff as smithing_rate,\n3600*(b.mining - a.mining)/b.time_diff as mining_rate,\n3600*(b.herblore - a.herblore)/b.time_diff as herblore_rate,\n3600*(b.agility - a.agility)/b.time_diff as agility_rate,\n3600*(b.thieving - a.thieving)/b.time_diff as thieving_rate,\n3600*(b.slayer - a.slayer)/b.time_diff as slayer_rate,\n3600*(b.farming - a.farming)/b.time_diff as farming_rate,\n3600*(b.runecraft - a.runecraft)/b.time_diff as runecraft_rate,\n3600*(b.hunter - a.hunter)/b.time_diff as hunter_rate,\n3600*(b.construction - a.construction)/b.time_diff as construction_rate\nFROM players_xp as a\nJOIN (\nSELECT c.*, d.lag_date, (d.date - c.date) as time_diff\nFROM players_xp AS c\nJOIN \n(SELECT name, date, LAG(date) OVER (PARTITION BY name ORDER BY date) AS lag_date \nFROM players_xp) AS d\nON c.name = d.name AND c.date = d.lag_date) AS b\nON a.name = b.name AND a.date = b.date\n"

In [186]:
player_rates2.write.save('/data/player_rates.json', format='json')

Py4JJavaError: An error occurred while calling o2614.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:198)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:159)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:668)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:276)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:270)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:228)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 134.0 failed 1 times, most recent failure: Lost task 3.0 in stage 134.0 (TID 9082, localhost, executor driver): java.io.IOException: (null) entry in command string: null chmod 0644 C:\data\player_rates.json\_temporary\0\_temporary\attempt_20190527163355_0134_m_000003_9082\part-00003-8068737a-a15b-45bc-805a-066c064f466b-c000.json
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:770)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:866)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:849)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:733)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:225)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:209)
	at org.apache.hadoop.fs.RawLocalFileSystem.createOutputStreamWithMode(RawLocalFileSystem.java:307)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:296)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:328)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.<init>(ChecksumFileSystem.java:398)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:461)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:892)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:789)
	at org.apache.spark.sql.execution.datasources.CodecStreams$.createOutputStream(CodecStreams.scala:81)
	at org.apache.spark.sql.execution.datasources.CodecStreams$.createOutputStreamWriter(CodecStreams.scala:92)
	at org.apache.spark.sql.execution.datasources.json.JsonOutputWriter.<init>(JsonFileFormat.scala:183)
	at org.apache.spark.sql.execution.datasources.json.JsonFileFormat$$anon$1.newInstance(JsonFileFormat.scala:82)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.newOutputWriter(FileFormatDataWriter.scala:120)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.<init>(FileFormatDataWriter.scala:108)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:236)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:403)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:409)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:167)
	... 32 more
Caused by: java.io.IOException: (null) entry in command string: null chmod 0644 C:\data\player_rates.json\_temporary\0\_temporary\attempt_20190527163355_0134_m_000003_9082\part-00003-8068737a-a15b-45bc-805a-066c064f466b-c000.json
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:770)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:866)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:849)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:733)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:225)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:209)
	at org.apache.hadoop.fs.RawLocalFileSystem.createOutputStreamWithMode(RawLocalFileSystem.java:307)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:296)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:328)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.<init>(ChecksumFileSystem.java:398)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:461)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:892)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:789)
	at org.apache.spark.sql.execution.datasources.CodecStreams$.createOutputStream(CodecStreams.scala:81)
	at org.apache.spark.sql.execution.datasources.CodecStreams$.createOutputStreamWriter(CodecStreams.scala:92)
	at org.apache.spark.sql.execution.datasources.json.JsonOutputWriter.<init>(JsonFileFormat.scala:183)
	at org.apache.spark.sql.execution.datasources.json.JsonFileFormat$$anon$1.newInstance(JsonFileFormat.scala:82)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.newOutputWriter(FileFormatDataWriter.scala:120)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.<init>(FileFormatDataWriter.scala:108)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:236)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:403)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:409)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more


In [190]:
player_rates2.write.save("./data/player_rates.parquet")

Py4JJavaError: An error occurred while calling o2652.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:198)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:159)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:668)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:276)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:270)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:228)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 135.0 failed 1 times, most recent failure: Lost task 1.0 in stage 135.0 (TID 9084, localhost, executor driver): java.io.IOException: (null) entry in command string: null chmod 0644 C:\Users\tyler\Documents\projects\bot_detection\data\player_rates.parquet\_temporary\0\_temporary\attempt_20190527163559_0135_m_000001_9084\part-00001-01956b70-2752-4def-9aa9-dbcc5f536263-c000.snappy.parquet
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:770)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:866)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:849)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:733)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:225)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:209)
	at org.apache.hadoop.fs.RawLocalFileSystem.createOutputStreamWithMode(RawLocalFileSystem.java:307)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:296)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:328)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.<init>(ChecksumFileSystem.java:398)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:461)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:892)
	at org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:248)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:390)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:349)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetOutputWriter.scala:37)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anon$1.newInstance(ParquetFileFormat.scala:151)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.newOutputWriter(FileFormatDataWriter.scala:120)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.<init>(FileFormatDataWriter.scala:108)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:236)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:403)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:409)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:167)
	... 32 more
Caused by: java.io.IOException: (null) entry in command string: null chmod 0644 C:\Users\tyler\Documents\projects\bot_detection\data\player_rates.parquet\_temporary\0\_temporary\attempt_20190527163559_0135_m_000001_9084\part-00001-01956b70-2752-4def-9aa9-dbcc5f536263-c000.snappy.parquet
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:770)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:866)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:849)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:733)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:225)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:209)
	at org.apache.hadoop.fs.RawLocalFileSystem.createOutputStreamWithMode(RawLocalFileSystem.java:307)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:296)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:328)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.<init>(ChecksumFileSystem.java:398)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:461)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:892)
	at org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:248)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:390)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:349)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetOutputWriter.scala:37)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anon$1.newInstance(ParquetFileFormat.scala:151)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.newOutputWriter(FileFormatDataWriter.scala:120)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.<init>(FileFormatDataWriter.scala:108)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:236)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:403)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:409)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more


In [196]:
player_rates2.write.mode('overwrite').save('./data/player_rates.parquet')

Py4JJavaError: An error occurred while calling o2728.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:198)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:159)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:668)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:276)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:270)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:228)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 137.0 failed 1 times, most recent failure: Lost task 3.0 in stage 137.0 (TID 9094, localhost, executor driver): java.io.IOException: (null) entry in command string: null chmod 0644 C:\Users\tyler\Documents\projects\bot_detection\data\player_rates.parquet\_temporary\0\_temporary\attempt_20190527163912_0137_m_000003_9094\part-00003-fdaf9b4c-ede5-4099-9527-4ecd6c2a9b7c-c000.snappy.parquet
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:770)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:866)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:849)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:733)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:225)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:209)
	at org.apache.hadoop.fs.RawLocalFileSystem.createOutputStreamWithMode(RawLocalFileSystem.java:307)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:296)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:328)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.<init>(ChecksumFileSystem.java:398)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:461)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:892)
	at org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:248)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:390)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:349)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetOutputWriter.scala:37)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anon$1.newInstance(ParquetFileFormat.scala:151)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.newOutputWriter(FileFormatDataWriter.scala:120)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.<init>(FileFormatDataWriter.scala:108)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:236)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:403)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:409)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:167)
	... 32 more
Caused by: java.io.IOException: (null) entry in command string: null chmod 0644 C:\Users\tyler\Documents\projects\bot_detection\data\player_rates.parquet\_temporary\0\_temporary\attempt_20190527163912_0137_m_000003_9094\part-00003-fdaf9b4c-ede5-4099-9527-4ecd6c2a9b7c-c000.snappy.parquet
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:770)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:866)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:849)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:733)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:225)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:209)
	at org.apache.hadoop.fs.RawLocalFileSystem.createOutputStreamWithMode(RawLocalFileSystem.java:307)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:296)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:328)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.<init>(ChecksumFileSystem.java:398)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:461)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:892)
	at org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:248)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:390)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:349)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetOutputWriter.scala:37)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anon$1.newInstance(ParquetFileFormat.scala:151)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.newOutputWriter(FileFormatDataWriter.scala:120)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.<init>(FileFormatDataWriter.scala:108)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:236)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:403)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:409)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more


In [74]:
sqlContext.sql('''CREATE or ALTER VIEW rates  IF NOT EXISTS''')

for skill in skills:
    sqlContext.sql('''SELECT name,
    date, 
    (?),
    LAG(?) OVER (PARTITION by name ORDER BY date) as (lag_(?)) 
    FROM players_xp''', (skill, skill))

ParseException: "\nno viable alternative at input 'CREATE or ALTER'(line 1, pos 10)\n\n== SQL ==\nCREATE or ALTER VIEW rates  IF NOT EXISTS\n----------^^^\n"

In [78]:
sqlContext.sql('''CREATE TABLE rates''')

sqlContext.sql('''SELECT name,
date,
(?), 
LAG(?) OVER (PARTITION by name ORDER BY date) as (lag_(?)) 
FROM players_xp
''', ('overall', overall)).collect()

AnalysisException: "Hive support is required to CREATE Hive TABLE (AS SELECT);;\n'CreateTable `rates`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, ErrorIfExists\n"

In [15]:
sqlContext.table('players_xp')

AnalysisException: 'Table or view not found: players_xp;'

In [40]:
#sqlContext.sql('''DROP TABLE players_rank''').collect()

[]

In [9]:
xp_df.groupby('name')

[Row(name='Mjpel3', date=datetime.datetime(2019, 5, 24, 0, 54, 9, 51961), overall=16294050, attack=1031276, defence=860741, strength=1528457, hitpoints=2219614, ranged=3259345, prayer=744358, magic=1458143, cooking=769577, woodcutting=588335, fletching=127612, fishing=676967, firemaking=106485, crafting=173610, smithing=104648, mining=749672, herblore=79803, agility=524779, thieving=169396, slayer=498128, farming=274860, runecraft=120365, hunter=125401, construction=102478),
 Row(name='Z A T 0 XX', date=datetime.datetime(2019, 5, 24, 0, 54, 9, 51961), overall=14392616, attack=309851, defence=65507, strength=2816878, hitpoints=1667719, ranged=1977078, prayer=125434, magic=1484428, cooking=783341, woodcutting=201439, fletching=15791, fishing=166138, firemaking=116305, crafting=318242, smithing=182060, mining=286286, herblore=62756, agility=215569, thieving=305446, slayer=168996, farming=46663, runecraft=21961, hunter=2949725, construction=105003),
 Row(name='Elliebus23', date=datetime.da

In [1]:
from pyhive import hive

In [2]:
host_name = "192.168.0.38"
port = 10000
user = "admin"
password = "password"
database="test_db"

In [3]:
def hiveconnection(host_name, port, user, password, database):
    conn = hive.Connection(host=host_name, port=port, username=user, password=password,
                           database=database, auth='CUSTOM')
    cur = conn.cursor()
    cur.execute('select item_sk,reason_sk, account_credit from returns limit 5')
    result = cur.fetchall()

    return result

In [4]:
output = hiveconnection(host_name, port, user,password, database)
print(output)

KeyboardInterrupt: 

In [46]:
from sqlalchemy import create_engine

host = 'hive://tyler@localhost'
host_name = 'localhost'
port = 10000
user = 'tyler'
password = 'tyler'
database = 'osrs'

engine = create_engine('hive://{0}:{1}'.format(host, port))
#engine.execute(f'CREATE DATABASE {database}')
#engine.execute(f'USE {database}')

In [18]:
engine.execute('CREATE TABLE players')

Engine(hive://tyler@localhost:10000)

In [30]:
from pyhive import hive
engine2 = hive.Connection(host = host, port = port)

failed to resolve sockaddr for hive://tyler@localhost:10000
Traceback (most recent call last):
  File "C:\Users\tyler\Anaconda3\envs\osrs\lib\site-packages\thrift\transport\TSocket.py", line 95, in open
    addrs = self._resolveAddr()
  File "C:\Users\tyler\Anaconda3\envs\osrs\lib\site-packages\thrift\transport\TSocket.py", line 42, in _resolveAddr
    socket.AI_PASSIVE | socket.AI_ADDRCONFIG)
  File "C:\Users\tyler\Anaconda3\envs\osrs\lib\socket.py", line 748, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11003] getaddrinfo failed


TTransportException: failed to resolve sockaddr for hive://tyler@localhost:10000

In [41]:
def hiveconnection(host_name, port, user,password, database):
    conn = hive.Connection(host=host_name, port=port)

In [47]:
engine2 = hive.Connection(host=host, port=port)

failed to resolve sockaddr for hive://tyler@localhost:10000
Traceback (most recent call last):
  File "C:\Users\tyler\Anaconda3\envs\osrs\lib\site-packages\thrift\transport\TSocket.py", line 95, in open
    addrs = self._resolveAddr()
  File "C:\Users\tyler\Anaconda3\envs\osrs\lib\site-packages\thrift\transport\TSocket.py", line 42, in _resolveAddr
    socket.AI_PASSIVE | socket.AI_ADDRCONFIG)
  File "C:\Users\tyler\Anaconda3\envs\osrs\lib\socket.py", line 748, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11003] getaddrinfo failed


TTransportException: failed to resolve sockaddr for hive://tyler@localhost:10000

In [50]:
from pyhive import hive
conn = hive.connect('localhost', port=10000, auth='NOSASL')
#cursor.execute('SELECT * from sample_07 LIMIT 5',async=True)
print(conn)

Could not connect to any of [('::1', 10000, 0, 0), ('127.0.0.1', 10000)]


TTransportException: Could not connect to any of [('::1', 10000, 0, 0), ('127.0.0.1', 10000)]

In [53]:
conn = hive.connect('localhost')

Could not connect to any of [('::1', 10000, 0, 0), ('127.0.0.1', 10000)]


TTransportException: Could not connect to any of [('::1', 10000, 0, 0), ('127.0.0.1', 10000)]

In [54]:
engine = create_engine('hive://localhost:10000/default')

In [59]:
engine = create_engine(
    'hive://tyler:tyler@localhost:10000/osrs',
    connect_args={'auth': 'LDAP'},
)

engine.execute('CREATE DATABASE osrs')

Could not connect to any of [('::1', 10000, 0, 0), ('127.0.0.1', 10000)]


TTransportException: Could not connect to any of [('::1', 10000, 0, 0), ('127.0.0.1', 10000)]

In [58]:
conn = hive.connect('hive://tyler:tyler@localhost:10000/osrs')

failed to resolve sockaddr for hive://tyler:tyler@localhost:10000/osrs:10000
Traceback (most recent call last):
  File "C:\Users\tyler\Anaconda3\envs\osrs\lib\site-packages\thrift\transport\TSocket.py", line 95, in open
    addrs = self._resolveAddr()
  File "C:\Users\tyler\Anaconda3\envs\osrs\lib\site-packages\thrift\transport\TSocket.py", line 42, in _resolveAddr
    socket.AI_PASSIVE | socket.AI_ADDRCONFIG)
  File "C:\Users\tyler\Anaconda3\envs\osrs\lib\socket.py", line 748, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11003] getaddrinfo failed


TTransportException: failed to resolve sockaddr for hive://tyler:tyler@localhost:10000/osrs:10000