In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,when,concat,length,row_number
from pyspark.sql import Window as WN
from pyspark.sql import types as TP


In [2]:
spark = SparkSession.builder.appName("REDUNTANT_PAIR").master("local[2]").getOrCreate()

In [3]:
spark

In [4]:
data = [
    ('apple', 'samsung', 2020, 1, 2, 1, 2),
    ('samsung', 'apple', 2020, 1, 2, 1, 2),
    ('apple', 'samsung', 2021, 1, 2, 5, 3),
    ('samsung', 'apple', 2021, 5, 3, 1, 2),
    ('google', None, 2020, 5, 9, None, None),
    ('oneplus', 'nothing', 2020, 5, 9, 6, 3)
]
schema = 'brand1 string , brand2 string , year int , custom1 int, custom2 int , custom3 int , custom4 int'

df = spark.createDataFrame(data = data , schema = schema)
df.show()

+-------+-------+----+-------+-------+-------+-------+
| brand1| brand2|year|custom1|custom2|custom3|custom4|
+-------+-------+----+-------+-------+-------+-------+
|  apple|samsung|2020|      1|      2|      1|      2|
|samsung|  apple|2020|      1|      2|      1|      2|
|  apple|samsung|2021|      1|      2|      5|      3|
|samsung|  apple|2021|      5|      3|      1|      2|
| google|   NULL|2020|      5|      9|   NULL|   NULL|
|oneplus|nothing|2020|      5|      9|      6|      3|
+-------+-------+----+-------+-------+-------+-------+



**For pairs of brands in the same year<br>
(e.g. apple/samsung/2020 and sample/apple/2020)<br>
IF custom1=custom3 and custom2 = custom4
then keep only one pair.<br>
For pairs of brands in the same year<br>
IF custom1 !=custom3 OR custom2 != custom4 then keep both<br>
For pairs of brands that do not have pairs in the same year<br>
Keep those rows as well
**

In [None]:
# Method-1

In [20]:
when_con = when( length(col('brand1')) > length(col('brand2')),concat(col('brand1'),col('brand2'),col('year'))).\
    otherwise(concat(col('brand2'),col('brand1'),col('year')))


In [25]:
df_pair = df.withColumn("pait_brand",when_con)

In [30]:
df_pair.show()

+-------+-------+----+-------+-------+-------+-------+------------------+
| brand1| brand2|year|custom1|custom2|custom3|custom4|        pait_brand|
+-------+-------+----+-------+-------+-------+-------+------------------+
|  apple|samsung|2020|      1|      2|      1|      2|  samsungapple2020|
|samsung|  apple|2020|      1|      2|      1|      2|  samsungapple2020|
|  apple|samsung|2021|      1|      2|      5|      3|  samsungapple2021|
|samsung|  apple|2021|      5|      3|      1|      2|  samsungapple2021|
| google|   NULL|2020|      5|      9|   NULL|   NULL|              NULL|
|oneplus|nothing|2020|      5|      9|      6|      3|oneplusnothing2020|
+-------+-------+----+-------+-------+-------+-------+------------------+



In [36]:
wind_cond = WN.partitionBy(col('pait_brand')).orderBy(col('year'))

partition_df = df_pair.withColumn("row_num",row_number().over(wind_cond))

In [38]:
partition_df.show()

+-------+-------+----+-------+-------+-------+-------+------------------+-------+
| brand1| brand2|year|custom1|custom2|custom3|custom4|        pait_brand|row_num|
+-------+-------+----+-------+-------+-------+-------+------------------+-------+
| google|   NULL|2020|      5|      9|   NULL|   NULL|              NULL|      1|
|oneplus|nothing|2020|      5|      9|      6|      3|oneplusnothing2020|      1|
|  apple|samsung|2020|      1|      2|      1|      2|  samsungapple2020|      1|
|samsung|  apple|2020|      1|      2|      1|      2|  samsungapple2020|      2|
|  apple|samsung|2021|      1|      2|      5|      3|  samsungapple2021|      1|
|samsung|  apple|2021|      5|      3|      1|      2|  samsungapple2021|      2|
+-------+-------+----+-------+-------+-------+-------+------------------+-------+



In [46]:
partition_df.select("brand1","brand2","year","custom1","custom2","custom3","custom4").\
filter((col('row_num') == 1) | ((col('custom1') != col('custom3')) | (col('custom2') != col('custom4')))).show()

+-------+-------+----+-------+-------+-------+-------+
| brand1| brand2|year|custom1|custom2|custom3|custom4|
+-------+-------+----+-------+-------+-------+-------+
| google|   NULL|2020|      5|      9|   NULL|   NULL|
|oneplus|nothing|2020|      5|      9|      6|      3|
|  apple|samsung|2020|      1|      2|      1|      2|
|  apple|samsung|2021|      1|      2|      5|      3|
|samsung|  apple|2021|      5|      3|      1|      2|
+-------+-------+----+-------+-------+-------+-------+



In [57]:
df.createOrReplaceTempView("brand_table2")

In [73]:
# Using SQL

In [58]:
spark.sql("select * from brand_table2").show()

+-------+-------+----+-------+-------+-------+-------+
| brand1| brand2|year|custom1|custom2|custom3|custom4|
+-------+-------+----+-------+-------+-------+-------+
|  apple|samsung|2020|      1|      2|      1|      2|
|samsung|  apple|2020|      1|      2|      1|      2|
|  apple|samsung|2021|      1|      2|      5|      3|
|samsung|  apple|2021|      5|      3|      1|      2|
| google|   NULL|2020|      5|      9|   NULL|   NULL|
|oneplus|nothing|2020|      5|      9|      6|      3|
+-------+-------+----+-------+-------+-------+-------+



In [72]:
sql_statement = """
WITH DATA AS
(
SELECT brand1,brand2,year,custom1,custom2,custom3,custom4,
CASE
WHEN LEN(brand1) > LEN(brand2) THEN CONCAT(brand1,brand2,year)
ELSE
CONCAT(brand1,brand2,year)
END AS PAIRS
FROM brand_table2
), filter_data AS(
SELECT brand1,brand2,year,custom1,custom2,custom3,custom4,PAIRS,
ROW_NUMBER() OVER(PARTITION BY PAIRS ORDER BY year) as RW1
from DATA
)
SELECT brand1,brand2,year,custom1,custom2,custom3,custom4
FROM filter_data WHERE RW1 =1 OR (custom1 != custom3 AND custom2 !=custom4)
"""
spark.sql(sql_statement).show()

+-------+-------+----+-------+-------+-------+-------+
| brand1| brand2|year|custom1|custom2|custom3|custom4|
+-------+-------+----+-------+-------+-------+-------+
| google|   NULL|2020|      5|      9|   NULL|   NULL|
|  apple|samsung|2020|      1|      2|      1|      2|
|  apple|samsung|2021|      1|      2|      5|      3|
|oneplus|nothing|2020|      5|      9|      6|      3|
|samsung|  apple|2020|      1|      2|      1|      2|
|samsung|  apple|2021|      5|      3|      1|      2|
+-------+-------+----+-------+-------+-------+-------+

