In [1]:
from pyspark.sql.functions import hour,minute,second,col,avg,when
from pyspark.sql import SQLContext
import pyspark.sql.functions as sql_functions
import datetime
# Entry point used below to read the CSV into a DataFrame.
# NOTE(review): `sc` is not defined anywhere in the visible notebook; presumably it is the
# SparkContext pre-created by the PySpark shell/kernel -- confirm this runs on a fresh kernel.
sqlContext = SQLContext(sc)

In [2]:
# Read the user log CSV from HDFS: the first row supplies column names and
# column types are inferred from the data.
USER_LOG_PATH = "hdfs://localhost:54310/user/hduser/user_log_data.csv"
df = (
    sqlContext.read
    .options(header=True, inferSchema=True)
    .csv(USER_LOG_PATH)
)

In [3]:
# Add an `hours` column holding the hour component of `working_hours`.
df1 = df.withColumn('hours', hour(col('working_hours')))

In [4]:
# Narrow df1 to the `user_name` and `hours` columns and preview two rows.
x = df1.select(['user_name', 'hours'])
x.show(n=2)

+--------------------+-----+
|           user_name|hours|
+--------------------+-----+
|  sahil24c@gmail.com|    5|
|magadum.iranna@gm...|    8|
+--------------------+-----+
only showing top 2 rows



In [5]:
# Rows whose `start_time` is greater than the cutoff below, reduced to user and start time.
# NOTE(review): the cutoff is a string literal carrying a fixed date; how the comparison is
# evaluated depends on the type inferred for `start_time` -- confirm this is the intended check.
start_cutoff = '2019-10-24 09:30:02'
started_after_cutoff = col('start_time') > start_cutoff
late_hours = df.where(started_after_cutoff).select('user_name', 'start_time')
late_hours.show(n=2)

+-------------------+-------------------+
|          user_name|         start_time|
+-------------------+-------------------+
| rr582619@gmail.con|2019-10-24 09:35:01|
|addyp1911@gmail.com|2019-10-24 09:35:02|
+-------------------+-------------------+
only showing top 2 rows



In [6]:

# `h_s` = the `hours` value multiplied by 3600 (hours expressed in seconds).
x = x.withColumn('h_s', col('hours') * 3600)
x.show(n=2)

+--------------------+-----+-----+
|           user_name|hours|  h_s|
+--------------------+-----+-----+
|  sahil24c@gmail.com|    5|18000|
|magadum.iranna@gm...|    8|28800|
+--------------------+-----+-----+
only showing top 2 rows



In [7]:
# Add a `min` column holding the minute component of `working_hours`.
df2 = df.withColumn('min', minute(col('working_hours')))

In [8]:

# Narrow df2 to the `user_name` and `min` columns.
y = df2.select(['user_name', 'min'])

In [9]:

# `m_s` = the `min` value multiplied by 60 (minutes expressed in seconds).
y = y.withColumn('m_s', col('min') * 60)
y.show(n=2)

+--------------------+---+----+
|           user_name|min| m_s|
+--------------------+---+----+
|  sahil24c@gmail.com| 50|3000|
|magadum.iranna@gm...| 39|2340|
+--------------------+---+----+
only showing top 2 rows



In [10]:
# Add a `sec` column holding the second component of `working_hours`.
df3 = df.withColumn('sec', second(col('working_hours')))

In [11]:

# Narrow df3 to the `user_name` and `sec` columns and preview two rows.
z = df3.select(['user_name', 'sec'])
z.show(n=2)

+--------------------+---+
|           user_name|sec|
+--------------------+---+
|  sahil24c@gmail.com|  0|
|magadum.iranna@gm...| 59|
+--------------------+---+
only showing top 2 rows



In [12]:
# Combine the hour and minute parts (plus their second-based equivalents) per row.
# They are derived in a single pass over `df` rather than by inner-joining `x` and `y`
# on `user_name`: both frames come from the same rows of `df`, so a join adds a shuffle
# and, whenever a `user_name` occurs more than once, pairs every occurrence with every
# other one, multiplying rows and skewing any statistic computed afterwards.
# NOTE(review): unlike an inner join, this keeps rows whose `user_name` is null.
# Resulting columns (same names and order as before): user_name, hours, h_s, min, m_s.
duration = col('working_hours')
df4 = df.select(
    'user_name',
    hour(duration).alias('hours'),
    (hour(duration) * 3600).alias('h_s'),
    minute(duration).alias('min'),
    (minute(duration) * 60).alias('m_s'),
)
df4.show(2)

+--------------------+-----+-----+---+----+
|           user_name|hours|  h_s|min| m_s|
+--------------------+-----+-----+---+----+
|  sahil24c@gmail.com|    5|18000| 50|3000|
|magadum.iranna@gm...|    8|28800| 39|2340|
+--------------------+-----+-----+---+----+
only showing top 2 rows



In [13]:

# Hour, minute and second parts of `working_hours` (plus the second-based equivalents of
# the first two) for every row of `df`.
# Everything is derived in one pass over `df` instead of inner-joining on `user_name`:
# a join on that key pairs every occurrence of a repeated `user_name` with every other
# one, multiplying rows and skewing the average computed later, and it costs a shuffle.
# NOTE(review): unlike an inner join, this keeps rows whose `user_name` is null.
# Resulting columns (same names and order as before): user_name, hours, h_s, min, m_s, sec.
duration = col('working_hours')
df5 = df.select(
    'user_name',
    hour(duration).alias('hours'),
    (hour(duration) * 3600).alias('h_s'),
    minute(duration).alias('min'),
    (minute(duration) * 60).alias('m_s'),
    second(duration).alias('sec'),
)
df5.show(2)

+--------------------+-----+-----+---+----+---+
|           user_name|hours|  h_s|min| m_s|sec|
+--------------------+-----+-----+---+----+---+
|  sahil24c@gmail.com|    5|18000| 50|3000|  0|
|magadum.iranna@gm...|    8|28800| 39|2340| 59|
+--------------------+-----+-----+---+----+---+
only showing top 2 rows



In [14]:

# Remove the raw `hours` and `min` columns; `h_s`, `m_s` and `sec` remain.
redundant_columns = ('hours', 'min')
df6 = df5.drop(*redundant_columns)
df6.show(n=2)

+--------------------+-----+----+---+
|           user_name|  h_s| m_s|sec|
+--------------------+-----+----+---+
|  sahil24c@gmail.com|18000|3000|  0|
|magadum.iranna@gm...|28800|2340| 59|
+--------------------+-----+----+---+
only showing top 2 rows



In [15]:
# `working_hours` = (h_s + m_s + sec) / 3600, i.e. the summed seconds expressed in hours.
total_seconds = col('h_s') + col('m_s') + col('sec')
df7 = df6.withColumn('working_hours', total_seconds / 3600)
df7.show(n=2)

+--------------------+-----+----+---+-----------------+
|           user_name|  h_s| m_s|sec|    working_hours|
+--------------------+-----+----+---+-----------------+
|  sahil24c@gmail.com|18000|3000|  0|5.833333333333333|
|magadum.iranna@gm...|28800|2340| 59| 8.66638888888889|
+--------------------+-----+----+---+-----------------+
only showing top 2 rows



In [16]:
# Mean of `working_hours` over all rows of df7, as a one-row DataFrame.
average = df7.agg(avg(col('working_hours')))
average.show()

+------------------+
|avg(working_hours)|
+------------------+
| 7.137449494949498|
+------------------+



In [17]:
# Rows of df7 whose `working_hours` exceeds the mean, and how many there are.
# The threshold is read from `average` (the one-row aggregate computed above) instead of
# a literal pasted from its printed output, so it stays correct when the data changes.
mean_working_hours = average.first()[0]
avg_count = df7.filter(sql_functions.col("working_hours") > mean_working_hours)
avg_count.count()

57

In [18]:
# List the users whose `working_hours` is above the mean.
# Reuses `avg_count` (the above-mean rows of df7 built in the previous cell) rather than
# repeating the filter with its own copy of the threshold, so the two cells cannot disagree.
avg_count.select('user_name').show()

+--------------------+
|           user_name|
+--------------------+
|magadum.iranna@gm...|
|  yathink3@gmail.com|
|  shelkeva@gmail.com|
|puruissimple@gmai...|
|sangita.awaghad19...|
|vaishusawant143@g...|
|     you@example.com|
|samadhanmahajan73...|
|ashutoshrit64@gma...|
|akshaybavalekar10...|
|khairnarswapna99@...|
|kukadeshilpaa7m95...|
|sarikabarge111@gm...|
|narsimharaj.kasu0...|
|antonyalexcm@gmai...|
|jitupatil937@gmai...|
|akshaypatwari24@g...|
|aheteshams007@gma...|
|surajpj7852@gmail...|
|kaleshwetaanil@gm...|
+--------------------+
only showing top 20 rows

