In [1]:
from pyspark.sql.functions import hour,minute,second,col,avg,when
from pyspark.sql import SQLContext
import pyspark.sql.functions as sql_functions
import datetime
# Entry point used below to read the CSV into a DataFrame.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably the
# PySpark kernel/shell injects the SparkContext under that name — confirm.
# NOTE(review): SQLContext is the legacy entry point; newer Spark releases
# recommend SparkSession — confirm against the cluster's Spark version.
sqlContext = SQLContext(sc)

In [2]:
# Load the user log CSV from HDFS: the first line is treated as the header
# and column types are inferred from the data.
user_log_path = "hdfs://localhost:54310/user/hduser/user_log_data.csv"
df = sqlContext.read.options(header=True, inferSchema=True).csv(user_log_path)

In [3]:
# Preview the raw log (first 20 rows, long values truncated).
df.show(n=20, truncate=True)

+--------------------+-------------------+-------------------+-------------------+-------------------+
|           user_name|          idle_time|      working_hours|         start_time|           end_time|
+--------------------+-------------------+-------------------+-------------------+-------------------+
|  sahil24c@gmail.com|2019-10-24 05:05:00|2019-10-24 05:50:00|2019-10-24 08:30:02|2019-10-24 19:25:02|
|magadum.iranna@gm...|2019-10-24 02:15:00|2019-10-24 08:39:59|2019-10-24 08:30:02|2019-10-24 19:25:01|
|  yathink3@gmail.com|2019-10-24 01:30:00|2019-10-24 09:24:59|2019-10-24 08:30:02|2019-10-24 19:25:01|
|  shelkeva@gmail.com|2019-10-24 00:30:00|2019-10-24 09:10:01|2019-10-24 08:45:01|2019-10-24 18:25:02|
|puruissimple@gmai...|2019-10-24 03:15:00|2019-10-24 07:19:59|2019-10-24 08:50:02|2019-10-24 19:25:01|
|sangita.awaghad19...|2019-10-24 01:55:00|2019-10-24 08:40:00|2019-10-24 08:50:01|2019-10-24 19:25:01|
|vaishusawant143@g...|2019-10-24 00:35:00|2019-10-24 09:55:00|2019-10-24 

In [4]:
# Print the inferred column names and types of the loaded log.
df.printSchema()

root
 |-- user_name: string (nullable = true)
 |-- idle_time: timestamp (nullable = true)
 |-- working_hours: timestamp (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- end_time: timestamp (nullable = true)



In [5]:

# Extract the hour component of the working_hours timestamp.
df1 = df.withColumn('hours', hour(col('working_hours')))

In [6]:
# Keep only the user and the extracted hour component.
x = df1.select(col('user_name'), col('hours'))

In [7]:
# Preview the per-user hour component (first 20 rows).
x.show(n=20)

+--------------------+-----+
|           user_name|hours|
+--------------------+-----+
|  sahil24c@gmail.com|    5|
|magadum.iranna@gm...|    8|
|  yathink3@gmail.com|    9|
|  shelkeva@gmail.com|    9|
|puruissimple@gmai...|    7|
|sangita.awaghad19...|    8|
|vaishusawant143@g...|    9|
|     you@example.com|    8|
|samadhanmahajan73...|    8|
|vishnu23kumar@gma...|    6|
|ashutoshrit64@gma...|   10|
|akshaybavalekar10...|    8|
|khairnarswapna99@...|    9|
|kukadeshilpaa7m95...|    9|
|sarikabarge111@gm...|    8|
|narsimharaj.kasu0...|    8|
|antonyalexcm@gmai...|   10|
|jitupatil937@gmai...|    9|
|akshaypatwari24@g...|    9|
|aheteshams007@gma...|    8|
+--------------------+-----+
only showing top 20 rows



In [8]:

# Convert the hour component to seconds (3600 seconds per hour).
x = x.withColumn('h_s', col('hours') * 3600)

In [9]:
# Preview hours alongside their value in seconds (first 20 rows).
x.show(n=20)

+--------------------+-----+-----+
|           user_name|hours|  h_s|
+--------------------+-----+-----+
|  sahil24c@gmail.com|    5|18000|
|magadum.iranna@gm...|    8|28800|
|  yathink3@gmail.com|    9|32400|
|  shelkeva@gmail.com|    9|32400|
|puruissimple@gmai...|    7|25200|
|sangita.awaghad19...|    8|28800|
|vaishusawant143@g...|    9|32400|
|     you@example.com|    8|28800|
|samadhanmahajan73...|    8|28800|
|vishnu23kumar@gma...|    6|21600|
|ashutoshrit64@gma...|   10|36000|
|akshaybavalekar10...|    8|28800|
|khairnarswapna99@...|    9|32400|
|kukadeshilpaa7m95...|    9|32400|
|sarikabarge111@gm...|    8|28800|
|narsimharaj.kasu0...|    8|28800|
|antonyalexcm@gmai...|   10|36000|
|jitupatil937@gmai...|    9|32400|
|akshaypatwari24@g...|    9|32400|
|aheteshams007@gma...|    8|28800|
+--------------------+-----+-----+
only showing top 20 rows



In [10]:
# Extract the minute component of the working_hours timestamp.
df2 = df.withColumn('min', minute(col('working_hours')))

In [11]:
# Keep only the user and the extracted minute component.
y = df2.select(col('user_name'), col('min'))


In [12]:
# Convert the minute component to seconds (60 seconds per minute) and preview it.
y = y.withColumn('m_s', col('min') * 60)
y.show(n=20)

+--------------------+---+----+
|           user_name|min| m_s|
+--------------------+---+----+
|  sahil24c@gmail.com| 50|3000|
|magadum.iranna@gm...| 39|2340|
|  yathink3@gmail.com| 24|1440|
|  shelkeva@gmail.com| 10| 600|
|puruissimple@gmai...| 19|1140|
|sangita.awaghad19...| 40|2400|
|vaishusawant143@g...| 55|3300|
|     you@example.com|  4| 240|
|samadhanmahajan73...| 39|2340|
|vishnu23kumar@gma...| 10| 600|
|ashutoshrit64@gma...| 25|1500|
|akshaybavalekar10...| 35|2100|
|khairnarswapna99@...| 25|1500|
|kukadeshilpaa7m95...| 40|2400|
|sarikabarge111@gm...| 45|2700|
|narsimharaj.kasu0...|  5| 300|
|antonyalexcm@gmai...| 15| 900|
|jitupatil937@gmai...| 20|1200|
|akshaypatwari24@g...|  5| 300|
|aheteshams007@gma...| 50|3000|
+--------------------+---+----+
only showing top 20 rows



In [13]:
# Extract the second component of the working_hours timestamp.
df3 = df.withColumn('sec', second(col('working_hours')))

In [14]:
# Keep only the user and the extracted second component, then preview it.
z = df3.select(col('user_name'), col('sec'))
z.show(n=20)

+--------------------+---+
|           user_name|sec|
+--------------------+---+
|  sahil24c@gmail.com|  0|
|magadum.iranna@gm...| 59|
|  yathink3@gmail.com| 59|
|  shelkeva@gmail.com|  1|
|puruissimple@gmai...| 59|
|sangita.awaghad19...|  0|
|vaishusawant143@g...|  0|
|     you@example.com| 59|
|samadhanmahajan73...| 59|
|vishnu23kumar@gma...|  0|
|ashutoshrit64@gma...|  1|
|akshaybavalekar10...|  0|
|khairnarswapna99@...|  0|
|kukadeshilpaa7m95...|  0|
|sarikabarge111@gm...|  0|
|narsimharaj.kasu0...|  0|
|antonyalexcm@gmai...|  0|
|jitupatil937@gmai...|  0|
|akshaypatwari24@g...|  0|
|aheteshams007@gma...|  1|
+--------------------+---+
only showing top 20 rows



In [15]:
# Combine the hour and minute components into one frame.
# Both components are projections of the same `df`, so they are derived here
# in a single select instead of joining x and y on user_name. The join is only
# correct when user_name is unique: a repeated user_name is matched against
# every one of its copies (extra rows), and rows with a null user_name are
# dropped by the inner join. The resulting columns and their order are the
# same as the join produced: user_name, hours, h_s, min, m_s.
df4 = df.select(
    'user_name',
    hour('working_hours').alias('hours'),
    (hour('working_hours') * 3600).alias('h_s'),
    minute('working_hours').alias('min'),
    (minute('working_hours') * 60).alias('m_s'),
)

In [16]:
# Preview the combined hour/minute frame (first 20 rows).
df4.show(n=20)

+--------------------+-----+-----+---+----+
|           user_name|hours|  h_s|min| m_s|
+--------------------+-----+-----+---+----+
|  sahil24c@gmail.com|    5|18000| 50|3000|
|magadum.iranna@gm...|    8|28800| 39|2340|
|  yathink3@gmail.com|    9|32400| 24|1440|
|  shelkeva@gmail.com|    9|32400| 10| 600|
|puruissimple@gmai...|    7|25200| 19|1140|
|sangita.awaghad19...|    8|28800| 40|2400|
|vaishusawant143@g...|    9|32400| 55|3300|
|     you@example.com|    8|28800|  4| 240|
|samadhanmahajan73...|    8|28800| 39|2340|
|vishnu23kumar@gma...|    6|21600| 10| 600|
|ashutoshrit64@gma...|   10|36000| 25|1500|
|akshaybavalekar10...|    8|28800| 35|2100|
|khairnarswapna99@...|    9|32400| 25|1500|
|kukadeshilpaa7m95...|    9|32400| 40|2400|
|sarikabarge111@gm...|    8|28800| 45|2700|
|narsimharaj.kasu0...|    8|28800|  5| 300|
|antonyalexcm@gmai...|   10|36000| 15| 900|
|jitupatil937@gmai...|    9|32400| 20|1200|
|akshaypatwari24@g...|    9|32400|  5| 300|
|aheteshams007@gma...|    8|2880

In [17]:
# Combine the hour, minute and second components into one frame.
# All three are projections of the same `df`, so they are derived here in a
# single select instead of chaining joins on user_name. Joining on user_name
# is only correct when it is unique: each repeated user_name multiplies rows
# again at every join (skewing any later average), and rows with a null
# user_name are dropped by the inner join. The resulting columns and their
# order are the same as the join produced: user_name, hours, h_s, min, m_s, sec.
df5 = df.select(
    'user_name',
    hour('working_hours').alias('hours'),
    (hour('working_hours') * 3600).alias('h_s'),
    minute('working_hours').alias('min'),
    (minute('working_hours') * 60).alias('m_s'),
    second('working_hours').alias('sec'),
)

In [18]:
# Preview the combined hour/minute/second frame (first 20 rows).
df5.show(n=20)

+--------------------+-----+-----+---+----+---+
|           user_name|hours|  h_s|min| m_s|sec|
+--------------------+-----+-----+---+----+---+
|  sahil24c@gmail.com|    5|18000| 50|3000|  0|
|magadum.iranna@gm...|    8|28800| 39|2340| 59|
|  yathink3@gmail.com|    9|32400| 24|1440| 59|
|  shelkeva@gmail.com|    9|32400| 10| 600|  1|
|puruissimple@gmai...|    7|25200| 19|1140| 59|
|sangita.awaghad19...|    8|28800| 40|2400|  0|
|vaishusawant143@g...|    9|32400| 55|3300|  0|
|     you@example.com|    8|28800|  4| 240| 59|
|samadhanmahajan73...|    8|28800| 39|2340| 59|
|vishnu23kumar@gma...|    6|21600| 10| 600|  0|
|ashutoshrit64@gma...|   10|36000| 25|1500|  1|
|akshaybavalekar10...|    8|28800| 35|2100|  0|
|khairnarswapna99@...|    9|32400| 25|1500|  0|
|kukadeshilpaa7m95...|    9|32400| 40|2400|  0|
|sarikabarge111@gm...|    8|28800| 45|2700|  0|
|narsimharaj.kasu0...|    8|28800|  5| 300|  0|
|antonyalexcm@gmai...|   10|36000| 15| 900|  0|
|jitupatil937@gmai...|    9|32400| 20|12

In [19]:
# The raw hour/minute components are no longer needed once their
# second-based equivalents (h_s, m_s) exist.
redundant_columns = ('hours', 'min')
df6 = df5.drop(*redundant_columns)

In [20]:
# Preview the per-user second-based components (first 20 rows).
df6.show(n=20)

+--------------------+-----+----+---+
|           user_name|  h_s| m_s|sec|
+--------------------+-----+----+---+
|  sahil24c@gmail.com|18000|3000|  0|
|magadum.iranna@gm...|28800|2340| 59|
|  yathink3@gmail.com|32400|1440| 59|
|  shelkeva@gmail.com|32400| 600|  1|
|puruissimple@gmai...|25200|1140| 59|
|sangita.awaghad19...|28800|2400|  0|
|vaishusawant143@g...|32400|3300|  0|
|     you@example.com|28800| 240| 59|
|samadhanmahajan73...|28800|2340| 59|
|vishnu23kumar@gma...|21600| 600|  0|
|ashutoshrit64@gma...|36000|1500|  1|
|akshaybavalekar10...|28800|2100|  0|
|khairnarswapna99@...|32400|1500|  0|
|kukadeshilpaa7m95...|32400|2400|  0|
|sarikabarge111@gm...|28800|2700|  0|
|narsimharaj.kasu0...|28800| 300|  0|
|antonyalexcm@gmai...|36000| 900|  0|
|jitupatil937@gmai...|32400|1200|  0|
|akshaypatwari24@g...|32400| 300|  0|
|aheteshams007@gma...|28800|3000|  1|
+--------------------+-----+----+---+
only showing top 20 rows



In [21]:

# Total worked time in seconds, expressed as fractional hours.
total_seconds = col('h_s') + col('m_s') + col('sec')
df7 = df6.withColumn('working_hours', total_seconds / 3600)

In [22]:
# Preview the fractional working hours per user (first 20 rows).
df7.show(n=20)

+--------------------+-----+----+---+------------------+
|           user_name|  h_s| m_s|sec|     working_hours|
+--------------------+-----+----+---+------------------+
|  sahil24c@gmail.com|18000|3000|  0| 5.833333333333333|
|magadum.iranna@gm...|28800|2340| 59|  8.66638888888889|
|  yathink3@gmail.com|32400|1440| 59|  9.41638888888889|
|  shelkeva@gmail.com|32400| 600|  1| 9.166944444444445|
|puruissimple@gmai...|25200|1140| 59| 7.333055555555555|
|sangita.awaghad19...|28800|2400|  0| 8.666666666666666|
|vaishusawant143@g...|32400|3300|  0| 9.916666666666666|
|     you@example.com|28800| 240| 59| 8.083055555555555|
|samadhanmahajan73...|28800|2340| 59|  8.66638888888889|
|vishnu23kumar@gma...|21600| 600|  0| 6.166666666666667|
|ashutoshrit64@gma...|36000|1500|  1|10.416944444444445|
|akshaybavalekar10...|28800|2100|  0| 8.583333333333334|
|khairnarswapna99@...|32400|1500|  0| 9.416666666666666|
|kukadeshilpaa7m95...|32400|2400|  0| 9.666666666666666|
|sarikabarge111@gm...|28800|270

In [23]:
# Average working hours across all rows of df7 (single-row result).
average = df7.agg(avg(col('working_hours')))
average.show(n=20)


+------------------+
|avg(working_hours)|
+------------------+
| 7.137449494949498|
+------------------+



In [24]:
# Inspect column types; working_hours is a double after the division by 3600.
df7.dtypes

[('user_name', 'string'),
 ('h_s', 'int'),
 ('m_s', 'int'),
 ('sec', 'int'),
 ('working_hours', 'double')]

In [25]:
# List users whose working hours are below the overall average.
# The threshold is computed from df7 rather than pasted from an earlier
# cell's output, so it stays correct when the input data or pipeline changes.
mean_working_hours = df7.agg(avg('working_hours')).first()[0]
df7.filter(df7['working_hours'] < mean_working_hours).select('user_name').show()

+--------------------+
|           user_name|
+--------------------+
|  sahil24c@gmail.com|
|vishnu23kumar@gma...|
|sargampandey27oct...|
|ayush.saraf47@gma...|
|mr.kundare@gmail.com|
|ruchikachile30199...|
|“shivnajalisangal...|
| addyp1911@gmail.com|
|dipakalagate1991@...|
|gaikwadr576@gmail...|
|tekina.makin@gmai...|
|mishrasushil889@g...|
| blsonalib@gmail.com|
|hakepratiksha55@g...|
|vaibhavpratihar17...|
|bsaivenkatavikas@...|
| youremail@email.com|
|polelaxman001@gma...|
|er.mukulvij96@gma...|
| dileep.bs@yahoo.com|
+--------------------+
only showing top 20 rows

