In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions
from pyspark.sql.types import IntegerType
from pyspark.sql import Window

In [2]:
from pyspark.sql.functions import *

# Tiger Analytics interview questions and answers in PySpark
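#### Find the origin and destination of each customer.
Note: A customer's journey can have more than one stop, so the origin is the departure city of the first flight and the destination is the arrival city of the last flight.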

In [3]:
# Sample journey legs, one tuple per flight taken by a customer.
data = [
    (1, "flight1", "Delhi", "Hyderabad"),
    (1, "flight2", "Hyderabad", "Kochi"),
    (1, "flight3", "Kochi", "Mangalore"),
    (2, "flight1", "Mumbai", "Ayudhya"),
    (2, "flight2", "Ayudhya", "Gorakhpur"),
]

In [4]:
spark= SparkSession.builder.master("local[2]").appName("DataSet").getOrCreate()

In [5]:
# Column names, applied positionally to the tuples in `data`.
_schema = [
    'cust_id',
    'flight',
    'origin',
    'destination',
]
dataframe = spark.createDataFrame(data, schema=_schema)
dataframe.show()

+-------+-------+---------+-----------+
|cust_id| flight|   origin|destination|
+-------+-------+---------+-----------+
|      1|flight1|    Delhi|  Hyderabad|
|      1|flight2|Hyderabad|      Kochi|
|      1|flight3|    Kochi|  Mangalore|
|      2|flight1|   Mumbai|    Ayudhya|
|      2|flight2|  Ayudhya|  Gorakhpur|
+-------+-------+---------+-----------+



### Generate a row number for each customer's flights

In [6]:
# Number each customer's legs in travel order.
# Ordering on the raw "flight" string is lexicographic ("flight10" sorts before
# "flight2"), which mislabels the first/last leg once a customer has 10+ flights.
# Order on the trailing numeric suffix instead; the raw string stays as a
# tie-breaker, so names without a numeric suffix keep their previous ordering.
flight_suffix = functions.regexp_extract(functions.col("flight"), r"(\d+)$", 1)
# Guard the cast so an empty suffix becomes NULL instead of a cast error under ANSI mode.
flight_seq = functions.when(flight_suffix != "", flight_suffix.cast("long"))
journey_window = (
    Window.partitionBy(functions.col("cust_id"))
    .orderBy(flight_seq, functions.col("flight"))
)
df1 = dataframe.withColumn("RN", functions.row_number().over(journey_window))

In [7]:
df1.show()

+-------+-------+---------+-----------+---+
|cust_id| flight|   origin|destination| RN|
+-------+-------+---------+-----------+---+
|      1|flight1|    Delhi|  Hyderabad|  1|
|      1|flight2|Hyderabad|      Kochi|  2|
|      1|flight3|    Kochi|  Mangalore|  3|
|      2|flight1|   Mumbai|    Ayudhya|  1|
|      2|flight2|  Ayudhya|  Gorakhpur|  2|
+-------+-------+---------+-----------+---+



## Find the minimum and maximum row number per customer

In [8]:
# Per customer: the row number of the first leg and of the last leg.
df2 = (
    df1.groupBy("cust_id")
    .agg(
        functions.min("RN").alias("origin_num"),
        functions.max("RN").alias("destination_num"),
    )
)

In [9]:
df2.show()

+-------+----------+---------------+
|cust_id|origin_num|destination_num|
+-------+----------+---------------+
|      1|         1|              3|
|      2|         1|              2|
+-------+----------+---------------+



In [10]:
# Attach each customer's first/last row numbers to every leg.
# df1's copy of the join key is dropped so only one cust_id column remains.
joined_df = (
    df1.join(df2, on=df1["cust_id"] == df2["cust_id"], how="inner")
    .drop(df1["cust_id"])
)

In [11]:
joined_df.show()

+-------+---------+-----------+---+-------+----------+---------------+
| flight|   origin|destination| RN|cust_id|origin_num|destination_num|
+-------+---------+-----------+---+-------+----------+---------------+
|flight3|    Kochi|  Mangalore|  3|      1|         1|              3|
|flight2|Hyderabad|      Kochi|  2|      1|         1|              3|
|flight1|    Delhi|  Hyderabad|  1|      1|         1|              3|
|flight2|  Ayudhya|  Gorakhpur|  2|      2|         1|              2|
|flight1|   Mumbai|    Ayudhya|  1|      2|         1|              2|
+-------+---------+-----------+---+-------+----------+---------------+



## Final Answer

In [12]:
# Collapse each customer's legs to a single row.
# Each conditional is non-null on exactly one leg (the first or the last), and
# the max aggregate skips nulls, so it returns that single surviving value.
first_leg_origin = functions.when(
    functions.col("RN") == functions.col("origin_num"), functions.col("origin")
)
last_leg_destination = functions.when(
    functions.col("RN") == functions.col("destination_num"), functions.col("destination")
)
final_df = joined_df.groupBy(functions.col("cust_id")).agg(
    functions.max(first_leg_origin).alias("origin"),
    functions.max(last_leg_destination).alias("destination"),
)
final_df.show()

+-------+------+-----------+
|cust_id|origin|destination|
+-------+------+-----------+
|      1| Delhi|  Mangalore|
|      2|Mumbai|  Gorakhpur|
+-------+------+-----------+



# Method-2

In [13]:
dataframe.show()

+-------+-------+---------+-----------+
|cust_id| flight|   origin|destination|
+-------+-------+---------+-----------+
|      1|flight1|    Delhi|  Hyderabad|
|      1|flight2|Hyderabad|      Kochi|
|      1|flight3|    Kochi|  Mangalore|
|      2|flight1|   Mumbai|    Ayudhya|
|      2|flight2|  Ayudhya|  Gorakhpur|
+-------+-------+---------+-----------+



In [14]:
# Number each customer's legs in travel order.
# Ordering on the raw "flight" string is lexicographic ("flight10" sorts before
# "flight2"), which mislabels the first/last leg once a customer has 10+ flights.
# Order on the trailing numeric suffix instead; the raw string stays as a
# tie-breaker, so names without a numeric suffix keep their previous ordering.
flight_suffix = functions.regexp_extract(functions.col("flight"), r"(\d+)$", 1)
# Guard the cast so an empty suffix becomes NULL instead of a cast error under ANSI mode.
flight_seq = functions.when(flight_suffix != "", flight_suffix.cast("long"))
journey_window = (
    Window.partitionBy(functions.col("cust_id"))
    .orderBy(flight_seq, functions.col("flight"))
)
df_row = dataframe.withColumn("RN", functions.row_number().over(journey_window))
df_row.show()

+-------+-------+---------+-----------+---+
|cust_id| flight|   origin|destination| RN|
+-------+-------+---------+-----------+---+
|      1|flight1|    Delhi|  Hyderabad|  1|
|      1|flight2|Hyderabad|      Kochi|  2|
|      1|flight3|    Kochi|  Mangalore|  3|
|      2|flight1|   Mumbai|    Ayudhya|  1|
|      2|flight2|  Ayudhya|  Gorakhpur|  2|
+-------+-------+---------+-----------+---+



In [15]:
# Per customer: first ("start") and last ("end") row number.
# The key is renamed to "id" so it does not clash with cust_id when joined back.
df22 = (
    df_row.groupBy("cust_id")
    .agg(
        functions.min("RN").alias("start"),
        functions.max("RN").alias("end"),
    )
    .select("start", "end", functions.col("cust_id").alias("id"))
)
df22.show()

+-----+---+---+
|start|end| id|
+-----+---+---+
|    1|  3|  1|
|    1|  2|  2|
+-----+---+---+



In [16]:
# Join every leg to its customer's first/last row numbers, then keep the origin
# only on the first leg and the destination only on the last leg (NULL elsewhere).
# The inner join already guarantees cust_id == id on every row, so the masks only
# need to compare RN against the bounds.
is_first_leg = col('RN') == col('start')
is_last_leg = col('RN') == col('end')

final_data = (
    df_row.join(df22, df_row.cust_id == df22.id)
    .withColumn('origin', when(is_first_leg, col('origin')))
    .withColumn('destination', when(is_last_leg, col('destination')))
    # Intermediate legs have both columns NULL and carry no information.
    .filter(col("origin").isNotNull() | col("destination").isNotNull())
)

final_data.show()

+-------+-------+------+-----------+---+-----+---+---+
|cust_id| flight|origin|destination| RN|start|end| id|
+-------+-------+------+-----------+---+-----+---+---+
|      1|flight3|  NULL|  Mangalore|  3|    1|  3|  1|
|      1|flight1| Delhi|       NULL|  1|    1|  3|  1|
|      2|flight2|  NULL|  Gorakhpur|  2|    1|  2|  2|
|      2|flight1|Mumbai|       NULL|  1|    1|  2|  2|
+-------+-------+------+-----------+---+-----+---+---+



In [17]:
# Two rows per customer remain: one carries the origin, the other the destination,
# so a plain select(...).distinct() cannot merge them; an aggregation is required.
final_data.show(n=20, truncate=True)

+-------+-------+------+-----------+---+-----+---+---+
|cust_id| flight|origin|destination| RN|start|end| id|
+-------+-------+------+-----------+---+-----+---+---+
|      1|flight3|  NULL|  Mangalore|  3|    1|  3|  1|
|      1|flight1| Delhi|       NULL|  1|    1|  3|  1|
|      2|flight2|  NULL|  Gorakhpur|  2|    1|  2|  2|
|      2|flight1|Mumbai|       NULL|  1|    1|  2|  2|
+-------+-------+------+-----------+---+-----+---+---+



In [18]:
# Merge each customer's two remaining rows into one.
# The max aggregate skips nulls, so it returns the single non-null origin/destination.
start_origin = functions.when(functions.col('RN') == functions.col('start'), functions.col('origin'))
end_destination = functions.when(functions.col('RN') == functions.col('end'), functions.col('destination'))
(
    final_data.groupBy('cust_id')
    .agg(
        functions.max(start_origin).alias('origin'),
        functions.max(end_destination).alias('destination'),
    )
    .show()
)

+-------+------+-----------+
|cust_id|origin|destination|
+-------+------+-----------+
|      1| Delhi|  Mangalore|
|      2|Mumbai|  Gorakhpur|
+-------+------+-----------+



In [19]:
# withColumn() needs both a column name and a Column expression; the previous
# one-argument call raised TypeError. Display the frame instead, which matches
# the output recorded for this cell.
final_data.show()

+-------+-------+------+-----------+---+-----+---+---+
|cust_id| flight|origin|destination| RN|start|end| id|
+-------+-------+------+-----------+---+-----+---+---+
|      1|flight3|  NULL|  Mangalore|  3|    1|  3|  1|
|      1|flight1| Delhi|       NULL|  1|    1|  3|  1|
|      2|flight2|  NULL|  Gorakhpur|  2|    1|  2|  2|
|      2|flight1|Mumbai|       NULL|  1|    1|  2|  2|
+-------+-------+------+-----------+---+-----+---+---+

