In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_hdfs_folder(sqlContext, folderpath):
    data_frame = sqlContext.read.csv(folderpath, header=True,
                                     inferSchema=True,nullValue="-")
    return data_frame

def read_buste_data_v3(sqlContext, folderpath):
    data_frame = read_hdfs_folder(sqlContext,folderpath)
    
    date = "-".join(folderpath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.lit(date))
    
    return data_frame

def printdf(df,l=10):
    return df.limit(l).toPandas()

In [3]:
spark  = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
sqlContext = pyspark.SQLContext(sc)

### Reading Data

In [31]:
base_folder_path = '/local/tarciso/masters/data/bus_trips/buste-v3a/may_jun_jul/'




In [None]:
od_matrix = read_hdfs_folder(sqlContext,base_folder_path + 'filtered_od/')

In [14]:
od_matrix.printSchema()

root
 |-- route: integer (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- stopPointId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- id: long (nullable = true)
 |-- o_route: integer (nullable = true)
 |-- o_bus_code: string (nullable = true)
 |-- o_date: integer (nullable = true)
 |-- o_trip

In [15]:
od_matrix = od_matrix.withColumn('date_str', F.from_unixtime(F.col('date'), 'yyyy-MM-dd'))

In [16]:
printdf(od_matrix.select(['date_str','route','o_stop_id','o_timestamp','stopPointId','timestamp','o_boarding_id']))

Unnamed: 0,date_str,route,o_stop_id,o_timestamp,stopPointId,timestamp,o_boarding_id
0,2017-05-04,180,28879,17:19:42,28991,19:53:56,498216206336
1,2017-05-04,311,32532,15:32:13,30669,23:01:50,678604832771
2,2017-05-04,2,29080,13:21:26,40029,14:53:57,1443109011464
3,2017-05-04,366,29099,09:23:24,29908,09:23:46,429496729607
4,2017-05-04,40,34119,06:25:54,30433,17:37:09,128849018889
5,2017-05-04,60,33267,18:13:14,33185,20:39:18,1546188226564
6,2017-05-04,684,32043,16:51:00,39262,19:16:12,1288490188820
7,2017-05-04,462,31481,10:38:25,26157,11:07:45,1039382085653
8,2017-05-04,50,28391,15:59:00,31800,18:42:35,1675037245459
9,2017-05-04,370,3377,07:16:17,3377,12:28:58,1468878815256


In [17]:
od_matrix.count()

386507

### Saving data to their respective date folders

In [20]:
dates = od_matrix.select('date_str').distinct().collect()

In [25]:
str(dates[0]['date_str'])

'2017-04-30'

In [33]:
for row in dates:
    curr_date = str(row['date_str'])
    print "Processing date", curr_date
    date_od_mat = od_matrix.filter(od_matrix.date_str == curr_date)
    print "Num records for this date:", date_od_mat.count()
    dest_folderpath = base_folder_path + '/od_days/' + curr_date.replace('-','_') + '_od'
    print dest_folderpath
    date_od_mat.write.csv(path=dest_folderpath,header=True, mode='overwrite')
    

 Processing date 2017-04-30
Num records for this date: 731
/local/tarciso/masters/data/bus_trips/buste-v3a/may_jun_jul//od_days/2017_04_30_od
Processing date 2017-05-04
Num records for this date: 30774
/local/tarciso/masters/data/bus_trips/buste-v3a/may_jun_jul//od_days/2017_05_04_od
Processing date 2017-06-22
Num records for this date: 2876
/local/tarciso/masters/data/bus_trips/buste-v3a/may_jun_jul//od_days/2017_06_22_od
Processing date 2017-05-05
Num records for this date: 732
/local/tarciso/masters/data/bus_trips/buste-v3a/may_jun_jul//od_days/2017_05_05_od
Processing date 2017-06-29
Num records for this date: 27293
/local/tarciso/masters/data/bus_trips/buste-v3a/may_jun_jul//od_days/2017_06_29_od
Processing date 2017-06-24
Num records for this date: 11492
/local/tarciso/masters/data/bus_trips/buste-v3a/may_jun_jul//od_days/2017_06_24_od
Processing date 2017-06-19
Num records for this date: 3230
/local/tarciso/masters/data/bus_trips/buste-v3a/may_jun_jul//od_days/2017_06_19_od
Proc