In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
import csv
import random
from faker import Faker
from datetime import datetime
import random
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.functions import col

conf = pyspark.SparkConf().setAppName("App")
conf = (conf.setMaster('local[*]')
        .set('spark.executor.memory', '4G')
        .set('spark.driver.memory', '4G')
        .set('spark.driver.maxResultSize', '10G'))
sc = pyspark.SparkContext(conf=conf)

In [4]:
%%time
#https://faker.readthedocs.io/en/latest/providers/faker.providers.address.html

def create_test_data():
    f=Faker('en_US') 
    g=open("phone_logs.csv","w")
    w=csv.writer(g)
    w.writerow(('start_time','from','to','duration', 'region','position'))
    
    for call_line in range(250000):
        duration = random.randrange(1,600,1)
        start_time = f.date_time_this_month(before_now=True, after_now=False, tzinfo=None)
        region = f.state_abbr()
        #call_from = random.randrange(1,600000,1)
        call_from = f.msisdn()
        call_to = f.msisdn()
        position = str(f.longitude()) + '|' + str(f.latitude())
        w.writerow((start_time,
                        call_from,
                        call_to,
                        duration,
                        region,
                        position))
    g.close()
create_test_data()

CPU times: user 48.2 s, sys: 1.44 s, total: 49.7 s
Wall time: 49.6 s


In [5]:
df = pd.read_csv('phone_logs.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 6 columns):
start_time    250000 non-null object
from          250000 non-null int64
to            250000 non-null int64
duration      250000 non-null int64
region        250000 non-null object
position      250000 non-null object
dtypes: int64(3), object(3)
memory usage: 11.4+ MB


## Read the data

In [7]:
%%time
sqlContext = SQLContext(sc)

#read CSV
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('phone_logs.csv')

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 3.35 s


In [8]:
df.show()

+-------------------+-------------+-------------+--------+------+--------------------+
|         start_time|         from|           to|duration|region|            position|
+-------------------+-------------+-------------+--------+------+--------------------+
|2018-01-14 18:11:47|2008143579347|8915430633821|     121|    IA|136.144484|76.578116|
|2018-01-05 13:29:41|6327084691796|5866415777536|     116|    GU|-98.028822|26.617...|
|2018-01-14 19:11:43| 174185996314|8093019252261|     180|    SC|8.535783|-62.4042065|
|2018-01-02 01:01:40|4635352389914|7232688231735|     173|    OH|64.535337|-86.098...|
|2018-01-12 21:39:44|7354660319439|7565515645157|     369|    NC|-124.866182|12.56...|
|2018-01-05 08:21:31|4296776422700|9565956723960|     274|    ID|-18.500108|-33.12...|
|2018-01-05 04:06:43|1129732342357|7110853029520|     364|    DC| 32.258607|14.231451|
|2018-01-11 19:40:58|7462618070254|8840972218289|     101|    KY|102.040054|-42.59...|
|2018-01-10 06:40:00|7390280538002|72406447

In [9]:
df.registerTempTable("calls")

In [10]:
df.printSchema()

root
 |-- start_time: timestamp (nullable = true)
 |-- from: long (nullable = true)
 |-- to: long (nullable = true)
 |-- duration: integer (nullable = true)
 |-- region: string (nullable = true)
 |-- position: string (nullable = true)



In [11]:
#df.groupby('from','region').agg(sum('duration').alias('total_time')).show()
    

In [12]:

def calculate_statistic(data_frame, N=5):
    data_frame = data_frame.groupby('from','region').agg(sum('duration').alias('total_time'))
    window = Window.partitionBy("region").orderBy(col("total_time").desc())
    data_frame = data_frame.withColumn("rank", dense_rank().over(window))
    return data_frame.withColumn('rnum', row_number().over(window)).filter(col("rnum") <= N)
    


In [13]:
%%time
N = 5
df2 = calculate_statistic(df, N)
df2.show()

+-------------+------+----------+----+----+
|         from|region|total_time|rank|rnum|
+-------------+------+----------+----+----+
|5746651522533|    AZ|       599|   1|   1|
|5026008121924|    AZ|       599|   1|   2|
|7372969699888|    AZ|       599|   1|   3|
|9601834467372|    AZ|       599|   1|   4|
|5218367820017|    AZ|       599|   1|   5|
|4157535881752|    SC|       599|   1|   1|
|1441330719479|    SC|       599|   1|   2|
|3524078496258|    SC|       599|   1|   3|
|7931358776272|    SC|       599|   1|   4|
|4185932299010|    SC|       599|   1|   5|
|8617580335321|    LA|       599|   1|   1|
|2360814160828|    LA|       599|   1|   2|
|7317897713460|    LA|       599|   1|   3|
|1016923998080|    LA|       599|   1|   4|
| 124684958630|    LA|       599|   1|   5|
|7521802805458|    MN|       599|   1|   1|
|8148129313009|    MN|       599|   1|   2|
|4323290802591|    MN|       599|   1|   3|
|6817126451767|    MN|       599|   1|   4|
|8875700759840|    MN|       599

In [14]:
%%time
#df.write.parquet('input-parquet')
df_parquet = sqlContext.read.parquet('input-parquet')
df2 = calculate_statistic(df_parquet, N)
df2.show()

+------+------+----------+----+----+
|  from|region|total_time|rank|rnum|
+------+------+----------+----+----+
|304799|    AZ|      1358|   1|   1|
|173140|    AZ|      1303|   2|   2|
|203211|    AZ|      1292|   3|   3|
|335253|    AZ|      1241|   4|   4|
|592513|    AZ|      1223|   5|   5|
|383799|    SC|      1365|   1|   1|
|531464|    SC|      1343|   2|   2|
|470616|    SC|      1322|   3|   3|
|467360|    SC|      1301|   4|   4|
|361734|    SC|      1290|   5|   5|
|402650|    LA|      1604|   1|   1|
|328353|    LA|      1582|   2|   2|
|436735|    LA|      1509|   3|   3|
|197125|    LA|      1410|   4|   4|
|434332|    LA|      1387|   5|   5|
|326652|    MN|      1523|   1|   1|
| 79227|    MN|      1454|   2|   2|
|226773|    MN|      1374|   3|   3|
|199326|    MN|      1283|   4|   4|
|287831|    MN|      1244|   5|   5|
+------+------+----------+----+----+
only showing top 20 rows

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 3.18 s
