In [0]:
from pyspark.sql.types import *

from pyspark import SparkContext
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("my project 1").getOrCreate()
sc = spark.sparkContext

# Read a CSV into a dataframe
# There is a smarter version, that will first check if there is a Parquet file and use it
def load_PD_file(filename_or_dir, schema) :
    dataPath = "/mnt/ddscoursedatastorage/fwm-stb-data/" + filename_or_dir
    df = spark.read.format("csv")\
      .option("header","false")\
      .option("delimiter", "|")\
      .schema(schema)\
      .load(dataPath)
    return df

In [0]:
df1 = spark.read.csv("/mnt/ddscoursedatastorage/dds-students/test.csv")

In [0]:

# Reading the Reference Parquet files

ref_data = spark.read.parquet('/ref_data_raw').withColumnRenamed("_device-id","device_id")\
                                                .withColumnRenamed("_dma","dma")\
                                                .withColumnRenamed("_dma-code","dma_code")\
                                                .withColumnRenamed("_household-id","household_id")\
                                                .withColumnRenamed("_household-type","household_type")\
                                                .withColumnRenamed("_system-type","system_type")\
                                                .withColumnRenamed("_zipcode","zipcode")
ref_data_count = ref_data.count()
print(type(ref_data))






<class 'pyspark.sql.dataframe.DataFrame'>


In [0]:

# Reading the Daily Programs CSV file

daily_prog_schema =  StructType([StructField('prog_code',StringType()),
                     StructField('title',StringType()),
                     StructField('genre',StringType()),
                     StructField('air_date',StringType()),
                     StructField('air_time',StringType()),
                     StructField('Duration',FloatType())
                                       ])
daily_prog_data = load_PD_file("Daily program data/" , daily_prog_schema  )
print(type(daily_prog_data))


<class 'pyspark.sql.dataframe.DataFrame'>


In [0]:
 #Reading the 2.5% sample of the viewing data from a Parquet file
 
viewing_data = spark.read.parquet('/sample_viewing_2_5percent')
 
print(f'There are {viewing_data.count():,} entries in viewing_data dataframe!')



There are 130,289,194 entries in viewing_data dataframe!


In [0]:
# Reading the Demographic CSV file

demographic_schema =  StructType([StructField('household_id',StringType()),
                      StructField('household_size',IntegerType()),
                      StructField('num_adults',IntegerType()),
                      StructField('num_generations',IntegerType()),
                      StructField('adult_range',StringType()),
                      StructField('marital_status',StringType()),
                      StructField('race_code',StringType()),
                      StructField('presence_children',StringType()),
                      StructField('num_children',IntegerType()),
                      StructField('age_children',StringType()), #format like range - 'bitwise'
                      StructField('age_range_children',StringType()),
                      StructField('dwelling_type',StringType()),
                      StructField('home_owner_status',StringType()),
                      StructField('length_residence',IntegerType()),
                      StructField('home_market_value',StringType()),
                      StructField('num_vehicles',IntegerType()),
                      StructField('vehicle_make',StringType()),
                      StructField('vehicle_model',StringType()),
                      StructField('vehicle_year',IntegerType()),
                      StructField('net_worth',IntegerType()),
                      StructField('income',StringType()),
                      StructField('gender_individual',StringType()),
                      StructField('age_individual',IntegerType()),
                      StructField('education_highest',StringType()),
                      StructField('occupation_highest',StringType()),
                      StructField('education_1',StringType()),
                      StructField('occupation_1',StringType()),
                      StructField('age_2',IntegerType()),
                      StructField('education_2',StringType()),
                      StructField('occupation_2',StringType()),
                      StructField('age_3',IntegerType()),
                      StructField('education_3',StringType()),
                      StructField('occupation_3',StringType()),
                      StructField('age_4',IntegerType()),
                      StructField('education_4',StringType()),
                      StructField('occupation_4',StringType()),
                      StructField('age_5',IntegerType()),
                      StructField('education_5',StringType()),
                      StructField('occupation_5',StringType()),
                      StructField('polit_party_regist',StringType()),
                      StructField('polit_party_input',StringType()),
                      StructField('household_clusters',StringType()),
                      StructField('insurance_groups',StringType()),
                      StructField('financial_groups',StringType()),
                      StructField('green_living',StringType())
                                       ])

demographic_data = load_PD_file("demographic/" , demographic_schema  )  
selected_demographic_data_df=demographic_data.select("household_id","household_size","num_adults","net_worth","income").dropDuplicates().cache()



In [0]:
from pyspark.sql import Window 
from pyspark.sql.functions import col, count, explode, split
# Creating the relevant relation
daily_df = daily_prog_data.select("prog_code", "genre").dropDuplicates()
# Spliting genres such that each only one genre per record: 
daily_df_split_genres = daily_df.withColumn("genre", explode(split("genre", ",")))
ref = ref_data.select(['device_id','dma']).na.drop().dropDuplicates()
viewing = viewing_data.select("device_id", "prog_code", "event_time", "event_date").dropDuplicates() 
#join viewing data and ref
data_join_ref = viewing.join(daily_df_split_genres, on="prog_code", how="inner")
data_join_ref = data_join_ref.join(ref, on="device_id", how="inner").cache()

In [0]:
 from pyspark.sql import Window 
 from pyspark.sql import functions as F

 from pyspark.sql.functions import col, count, desc, concat_ws, dense_rank,collect_list

genre_entry_count_df = data_join_ref.groupBy("dma", "genre").agg(F.count("prog_code").alias("Entry_Count"))

# Creating a window  partitioned by 'dma' and ordering it  by the Entry_Count in descending order
windowSpec = Window.partitionBy("dma").orderBy(F.desc("Entry_Count"))

# Add a row_number column to the data frame that will count the row number
genre_counts_with_row_number = genre_entry_count_df.withColumn("row_number", F.row_number().over(windowSpec))

# Filtering the DataFrame genre_counts_with_row_number to keep only the genres with row numbers <= 5
top5_genres_in_dma = genre_counts_with_row_number.filter(F.col("row_number") <= 5)

# Drop the row number column
top5_genres_in_dma = top5_genres_in_dma.drop(*["row_number"])

# Extract distinct genres from the daily_df_split_genres DataFrame
all_genres = daily_df_split_genres.select("genre").distinct()

# Extract distinct DMAs from the ref DataFrame
all_dma = ref.select("dma").distinct()

# Create a cross join between all DMAs and all genres
cross = all_dma.crossJoin(all_genres)

# Join the cross join result with the top 5 genres in each DMA DataFrame
final_res = cross.join(top5_genres_in_dma, on=["dma", "genre"], how="left")

# Replace null values with 0
final_res = final_res.fillna(0)

# Order the final result by DMA and Entry_Count in descending order
final_res = final_res.orderBy("dma", desc("Entry_Count"))

# Cache the final result DataFrame for better performance
final_res = final_res.cache()

# Group the final result by DMA and collect the genres into a list
final_res = final_res.groupBy("dma").agg(collect_list("genre").alias("genre_list"))


for i in['Waco-Temple-Bryan','New York', 'Washington, DC (Hagrstwn)' ]:
    top5_genres_in_dma.filter(col("DMA")==i).show()


+-----------------+-------+-----------+
|              dma|  genre|Entry_Count|
+-----------------+-------+-----------+
|Waco-Temple-Bryan|Reality|     317141|
|Waco-Temple-Bryan|   News|     294302|
|Waco-Temple-Bryan|   Talk|     191523|
|Waco-Temple-Bryan| Sitcom|     184331|
|Waco-Temple-Bryan|  Drama|     159818|
+-----------------+-------+-----------+

+--------+-----------+-----------+
|     dma|      genre|Entry_Count|
+--------+-----------+-----------+
|New York|       News|     260400|
|New York|    Reality|     207070|
|New York|       Talk|     138032|
|New York|     Sitcom|     124509|
|New York|Documentary|     105943|
+--------+-----------+-----------+

+--------------------+--------+-----------+
|                 dma|   genre|Entry_Count|
+--------------------+--------+-----------+
|Washington, DC (H...| Reality|     309259|
|Washington, DC (H...|    News|     275033|
|Washington, DC (H...|  Sitcom|     217373|
|Washington, DC (H...|  Comedy|     182383|
|Washington, DC

In [0]:
from pyspark.sql import functions
display(final_res )



dma,genre_list
Amarillo,"List(Talk, Sitcom, Drama, Hockey, Bullfighting, Bus./financial, Game show, Hydroplane racing, Entertainment, Canoe, Horse, Tennis, Suspense, Spanish, Documentary, Science, Cooking, Holiday-children special, Nature, Weather, Theater, Religious, Children-special, Agriculture, Track/field, Special, Standup, Beach volleyball, Gaming, Crime drama, Beach soccer, House/garden, Music, Water polo, Mountain biking, Bicycle racing, Auction, Politics, Sumo wrestling, Swimming, Motorsports, Ballet, Newsmagazine, Fishing, Action, How-to, Anthology, Biography, Animated, Consumer, Variety, Animals, Table tennis, Snowboarding, Parenting, Poker, Rodeo, Rowing, Field hockey, Figure skating, Dog show, Skateboarding, Drag racing, Holiday, Bobsled, Gymnastics, Holiday music special, Mystery, Music talk, Soccer, Children, Soap, Rugby, Basketball, Computers, Billiards, Surfing, Arts/crafts, Playoff sports, Lacrosse, Musical, Educational, Parade, Bicycle, Music special, Pool, Speed skating, Historical drama, Travel, Running, Gay/lesbian, Sailing, Home improvement, Romance, Adventure, Triathlon, Intl soccer, Miniseries, Skiing, Fundraiser, Bowling, Debate, Dog racing, Golf, Pro wrestling, Comedy-drama, Science fiction, Aerobics, Boat, Darts, Auto racing, Environment, Holiday-children, Racquet, Military, Sports non-event, Volleyball, Arm wrestling, Community, Dance, Snowmobile, Aviation, Equestrian, Martial arts, Wrestling, Boxing, Fashion, Holiday special, Baseball, Paranormal, Anime, Collectibles, Badminton, Outdoors, Sports event, Kayaking, Performing arts, Watersports, Handball, Law, Health, Olympics, Public affairs, Sports talk, Water skiing, Art, Children-music, Card games, Crime, War, Skating, Softball, Awards, Comedy, Musical comedy, Auto, Biathlon, Indoor soccer, Horror, Shopping, Skeleton, Shooting, Squash, Hunting, Football, Event, Intl hockey, Medical, Cricket, Self improvement, Fantasy, Motorcycle racing, Interview, Adults only, Children-talk, Yacht racing, Mixed martial arts, Fencing, Exercise, Boat racing, Luge, Curling, Weightlifting, Romance-comedy, History, Opera, Diving, Action sports, Archery, Polo, Docudrama, Soap talk, Technology, Gaelic football, Motorcycle, Cheerleading, Western, News, Reality)"
Augusta-Aiken,"List(Comedy, Sports event, Action, Consumer, Entertainment, Paranormal, Water polo, Horror, Dance, Holiday special, Darts, Children-talk, Fashion, Aviation, Fantasy, Anthology, Technology, Educational, Skeleton, Special, Skiing, Children, Medical, Snowmobile, Gymnastics, Soap talk, Performing arts, Rodeo, Game show, Romance, Music special, Miniseries, Gay/lesbian, Bobsled, Motorsports, Parenting, Romance-comedy, Sailing, Sports non-event, Beach volleyball, Docudrama, Olympics, Sumo wrestling, Soccer, Comedy-drama, Fencing, Boat racing, Table tennis, Running, Western, Golf, Badminton, Musical, Dog show, Community, Documentary, Arm wrestling, Politics, Health, Pool, Debate, Animals, Intl hockey, Shopping, Soap, Children-music, Gaming, Card games, Science fiction, Luge, Snowboarding, Motorcycle, Racquet, Bullfighting, Diving, Fundraiser, Baseball, Drama, Motorcycle racing, Weightlifting, Biography, Fishing, Boat, Dog racing, Science, Curling, Intl soccer, Ballet, Billiards, Religious, Sitcom, Anime, Nature, Shooting, War, Law, Pro wrestling, Art, Interview, Public affairs, Hunting, Hockey, Polo, Action sports, Tennis, Rowing, Music talk, Swimming, Biathlon, Suspense, Boxing, Cricket, Figure skating, Aerobics, Basketball, Newsmagazine, Spanish, Yacht racing, Animated, Auto, Wrestling, Horse, History, Holiday music special, Travel, Opera, Canoe, Mystery, Lacrosse, Military, Softball, Event, Hydroplane racing, Historical drama, How-to, Bicycle, Variety, Equestrian, Triathlon, Skating, Holiday-children special, Mixed martial arts, Exercise, Parade, Bowling, Agriculture, Kayaking, Archery, Bicycle racing, Track/field, Weather, Speed skating, Sports talk, Squash, Arts/crafts, Beach soccer, Gaelic football, Outdoors, Adults only, Watersports, Field hockey, Crime drama, Football, Martial arts, Children-special, Cooking, Indoor soccer, Skateboarding, Music, Cheerleading, Auction, Computers, Environment, Volleyball, Water skiing, Poker, Holiday-children, Bus./financial, Playoff sports, Adventure, Talk, Awards, House/garden, Theater, Self improvement, Auto racing, Drag racing, Rugby, Handball, Holiday, Home improvement, Collectibles, Standup, Surfing, Mountain biking, Crime, Musical comedy, Reality, News)"
"Bend, OR","List(News, Talk, Sitcom, Comedy, Equestrian, Rowing, Children-special, Cooking, Fashion, Fishing, Gaelic football, Science, Animals, Aviation, Drama, Gymnastics, Parenting, Agriculture, Paranormal, Swimming, Documentary, Crime, Dog racing, Watersports, Standup, Skating, Bullfighting, Cricket, Darts, Art, Technology, Educational, Entertainment, Canoe, Billiards, Mountain biking, Volleyball, Martial arts, Romance, Suspense, Event, Rugby, Snowboarding, Speed skating, Consumer, Horror, Special, Sports event, Crime drama, Card games, Awards, Lacrosse, Auto, Holiday special, Horse, Community, Olympics, Aerobics, Bobsled, Action sports, Track/field, Bowling, Comedy-drama, Luge, Beach soccer, Medical, Fantasy, Sailing, Bus./financial, Soap talk, Adults only, Boxing, Dance, Game show, Fundraiser, Music, Romance-comedy, Biathlon, Nature, Debate, Archery, Skiing, Western, Auto racing, Holiday-children, Basketball, Law, Sports non-event, How-to, Animated, House/garden, Music special, Politics, Home improvement, Outdoors, Water polo, Golf, Mystery, Biography, Yacht racing, Dog show, Playoff sports, Pro wrestling, Mixed martial arts, Self improvement, Music talk, Variety, Triathlon, Theater, Beach volleyball, Pool, Gay/lesbian, Bicycle, Wrestling, Arts/crafts, Children, Hunting, Parade, Sumo wrestling, Auction, Shooting, Environment, Intl soccer, Ballet, Hydroplane racing, Bicycle racing, Hockey, Collectibles, Opera, Performing arts, Running, Rodeo, Computers, Motorsports, War, Drag racing, Table tennis, Religious, Sports talk, Baseball, Tennis, Arm wrestling, Soccer, Softball, Squash, Diving, Field hockey, Figure skating, Newsmagazine, Docudrama, Holiday music special, Skateboarding, Children-music, Musical comedy, Badminton, Children-talk, Fencing, Gaming, Shopping, History, Intl hockey, Weightlifting, Public affairs, Skeleton, Football, Soap, Science fiction, Weather, Anime, Curling, Miniseries, Surfing, Holiday-children special, Water skiing, Anthology, Poker, Polo, Military, Historical drama, Interview, Racquet, Indoor soccer, Action, Motorcycle racing, Exercise, Handball, Travel, Boat, Cheerleading, Adventure, Boat racing, Motorcycle, Health, Spanish, Kayaking, Musical, Snowmobile, Holiday, Reality)"
Buffalo,"List(Crime drama, News, Reality, Action, Drama, Public affairs, Computers, Surfing, Table tennis, Bullfighting, Arts/crafts, Soap talk, Mountain biking, Agriculture, Debate, Miniseries, Sports talk, Field hockey, Holiday special, Lacrosse, Mystery, Diving, Sports non-event, Ballet, Documentary, Outdoors, Watersports, Gymnastics, Playoff sports, Politics, Arm wrestling, Environment, Kayaking, Motorcycle racing, Swimming, Cheerleading, Shooting, Handball, Comedy-drama, Musical, How-to, Technology, Anthology, Curling, Hydroplane racing, Figure skating, Track/field, Holiday-children special, Parenting, Hockey, Olympics, Boat, Dance, Event, Indoor soccer, Children-talk, Speed skating, Western, Wrestling, Holiday, Comedy, Weightlifting, Baseball, Motorcycle, Squash, Cooking, Musical comedy, Intl soccer, Home improvement, Religious, Bicycle, Animated, Water skiing, Card games, Talk, Polo, Adults only, Romance, Music talk, Poker, Children, Football, Tennis, Science fiction, War, Volleyball, Consumer, Interview, Animals, Auto racing, Children-music, Adventure, Intl hockey, Yacht racing, Horror, Law, Skating, Equestrian, Fantasy, Shopping, Skateboarding, Travel, Military, Horse, Weather, Health, Special, Beach soccer, House/garden, Racquet, Motorsports, Music, Darts, Snowboarding, Community, Gay/lesbian, Running, Triathlon, Anime, Holiday-children, Archery, Pro wrestling, Exercise, Martial arts, Entertainment, Boat racing, Gaelic football, Self improvement, Aviation, Suspense, Educational, Gaming, Bicycle racing, Bus./financial, Fundraiser, Drag racing, Rugby, Art, Auto, Mixed martial arts, Parade, Fishing, Nature, Soap, Action sports, Dog racing, Historical drama, Luge, Opera, Sitcom, Variety, Performing arts, Science, Beach volleyball, Dog show, Canoe, Children-special, Pool, Fashion, Snowmobile, Bowling, Soccer, Skiing, Spanish, Medical, Skeleton, Bobsled, Hunting, Romance-comedy, Softball, Paranormal, Rodeo, Sports event, Standup, History, Theater, Fencing, Biography, Docudrama, Game show, Basketball, Holiday music special, Aerobics, Golf, Music special, Cricket, Collectibles, Biathlon, Badminton, Water polo, Awards, Billiards, Auction, Boxing, Newsmagazine, Sailing, Rowing, Crime, Sumo wrestling)"
Charleston-Huntington,"List(News, Sitcom, Drama, Talk, Politics, How-to, Playoff sports, Variety, Polo, Comedy-drama, Musical comedy, Auto racing, Mystery, Rugby, Billiards, House/garden, Sailing, Children-special, Olympics, Soap talk, Travel, Darts, Environment, Shooting, Awards, Watersports, Action sports, Interview, Running, Documentary, Horror, Military, Racquet, Event, Educational, Water skiing, Aviation, Track/field, War, Anthology, Biography, Theater, Game show, Romance, Religious, Gay/lesbian, Western, Weightlifting, Boat, Mountain biking, Squash, Fundraiser, Technology, Lacrosse, Crime, Softball, Auto, Debate, Motorcycle, Skiing, Anime, Curling, Sports talk, Science fiction, Volleyball, Horse, Holiday music special, Pro wrestling, Wrestling, Arm wrestling, Intl soccer, Dog show, Surfing, Hunting, Hockey, Motorsports, Community, Entertainment, Triathlon, Luge, Tennis, Action, Indoor soccer, Biathlon, Dance, Snowmobile, Performing arts, Sports non-event, Golf, Skateboarding, Diving, Music special, Children-talk, Health, Arts/crafts, Children, Mixed martial arts, Drag racing, Badminton, Parenting, Ballet, Holiday, Adventure, Pool, Soccer, Home improvement, Water polo, Intl hockey, Spanish, Bus./financial, Skeleton, History, Standup, Beach soccer, Table tennis, Historical drama, Romance-comedy, Cricket, Gaming, Medical, Self improvement, Agriculture, Kayaking, Outdoors, Bullfighting, Holiday-children special, Holiday special, Boxing, Boat racing, Motorcycle racing, Poker, Weather, Soap, Basketball, Bicycle, Animals, Baseball, Comedy, Art, Skating, Computers, Equestrian, Rowing, Collectibles, Canoe, Fantasy, Fishing, Crime drama, Newsmagazine, Animated, Miniseries, Science, Children-music, Hydroplane racing, Martial arts, Docudrama, Fashion, Holiday-children, Swimming, Aerobics, Figure skating, Dog racing, Football, Gaelic football, Beach volleyball, Auction, Bobsled, Cheerleading, Public affairs, Shopping, Yacht racing, Musical, Suspense, Parade, Bicycle racing, Consumer, Music talk, Cooking, Exercise, Adults only, Card games, Sports event, Archery, Opera, Music, Rodeo, Snowboarding, Bowling, Field hockey, Special, Paranormal, Nature, Law, Speed skating, Gymnastics, Handball, Sumo wrestling, Fencing, Reality)"
Cincinnati,"List(News, Reality, Sitcom, Talk, Comedy, Hockey, Soap, Anime, Kayaking, Musical comedy, Bicycle racing, Fishing, Skateboarding, Holiday-children, Crime, Holiday, Children-music, Event, Performing arts, Documentary, Holiday-children special, Rowing, Yacht racing, Auto racing, Motorcycle, Motorcycle racing, Soccer, Diving, Travel, Baseball, Dog show, Educational, Science fiction, Home improvement, Poker, Children-talk, Figure skating, Music talk, Religious, Volleyball, Dog racing, Miniseries, Bicycle, Entertainment, Gaelic football, Hydroplane racing, Interview, Skeleton, Fashion, Canoe, Golf, Running, Boat racing, Nature, Weather, Billiards, Sailing, Cheerleading, Hunting, Olympics, Polo, Crime drama, Dance, Archery, Luge, Wrestling, Auto, Sports talk, Music special, Action sports, Pro wrestling, Animals, Beach volleyball, Field hockey, Playoff sports, Boxing, Bus./financial, Drag racing, Skiing, Bowling, Suspense, Fantasy, Spanish, Cricket, Consumer, Mystery, Shopping, Animated, Holiday special, Mixed martial arts, Horror, How-to, Intl soccer, Exercise, Aviation, Biathlon, House/garden, Ballet, Equestrian, Self improvement, Opera, Politics, Romance, Soap talk, Public affairs, Action, Law, Boat, Card games, Fencing, Military, Children-special, Intl hockey, Awards, Historical drama, Basketball, Water polo, Docudrama, Music, Sports non-event, Holiday music special, Romance-comedy, History, Curling, Sumo wrestling, Football, Adults only, Adventure, Agriculture, Environment, Drama, Newsmagazine, Collectibles, Anthology, Track/field, War, Fundraiser, Horse, Darts, Game show, Variety, Aerobics, Parade, Paranormal, Water skiing, Special, Shooting, Badminton, Medical, Technology, Racquet, Weightlifting, Science, Children, Sports event, Table tennis, Western, Gaming, Lacrosse, Softball, Arts/crafts, Computers, Gay/lesbian, Martial arts, Speed skating, Parenting, Auction, Bullfighting, Squash, Comedy-drama, Rodeo, Swimming, Debate, Mountain biking, Skating, Watersports, Indoor soccer, Handball, Standup, Cooking, Snowboarding, Motorsports, Pool, Outdoors, Rugby, Bobsled, Arm wrestling, Beach soccer, Gymnastics, Surfing, Snowmobile, Art, Biography, Musical, Theater, Health, Tennis, Community, Triathlon)"
Dallas-Ft. Worth,"List(Talk, Sitcom, Drama, Adults only, Fashion, Self improvement, Romance, Bowling, Martial arts, Soap, Law, Variety, Racquet, Standup, Cheerleading, Sailing, Technology, Cricket, Kayaking, Skateboarding, Table tennis, Track/field, Musical comedy, Rugby, Speed skating, Parenting, Holiday, Mountain biking, Theater, Arts/crafts, Paranormal, Holiday-children special, Public affairs, Rodeo, Water skiing, Biography, Boat, Intl hockey, Gymnastics, Weather, Miniseries, Exercise, Travel, Crime drama, Basketball, Entertainment, Computers, Opera, Shooting, Outdoors, Children-music, Cooking, Motorcycle, Spanish, Awards, Figure skating, Holiday music special, Darts, History, Pro wrestling, Soap talk, Playoff sports, Surfing, Children-special, Pool, Beach volleyball, Diving, Skeleton, Mystery, Squash, Water polo, Special, Animated, Canoe, Football, Mixed martial arts, Sports talk, Boxing, Gaming, Sumo wrestling, Agriculture, Boat racing, Golf, Billiards, Historical drama, Collectibles, Medical, Romance-comedy, Weightlifting, Shopping, Curling, Horror, Card games, Community, Beach soccer, Adventure, Comedy-drama, Motorcycle racing, House/garden, Health, War, Polo, Hunting, Military, Educational, Wrestling, Action sports, Field hockey, Ballet, Bicycle, Gaelic football, Handball, Event, Docudrama, Intl soccer, Religious, Holiday-children, Rowing, Auto racing, Biathlon, Running, Skating, Skiing, Indoor soccer, Motorsports, Auction, Musical, Gay/lesbian, Dance, Bullfighting, Crime, Luge, Badminton, Interview, Action, Game show, Olympics, Snowboarding, Baseball, Fishing, Hockey, Lacrosse, Snowmobile, Sports non-event, Triathlon, Music, Aviation, Nature, Science fiction, Aerobics, Swimming, Archery, Hydroplane racing, Soccer, Animals, Dog show, Environment, Science, Fundraiser, Volleyball, Consumer, Dog racing, Fantasy, Bicycle racing, Comedy, Documentary, Art, Anthology, Bobsled, Home improvement, Auto, Western, Tennis, Sports event, Fencing, Children, Holiday special, Parade, Arm wrestling, Poker, Performing arts, Equestrian, Debate, Politics, Children-talk, Horse, Drag racing, Bus./financial, Music talk, Softball, Anime, Yacht racing, Watersports, Music special, Newsmagazine, Suspense, How-to, Reality, News)"
Erie,"List(Reality, Sitcom, News, Talk, Documentary, Comedy-drama, Fashion, Interview, Soccer, Weather, Adventure, Drag racing, Martial arts, Skating, Triathlon, Action sports, Fencing, Gymnastics, Softball, Aerobics, Bus./financial, Gaelic football, Playoff sports, War, Cheerleading, Holiday-children special, Rowing, Dance, Table tennis, Music, Boat racing, Dog show, How-to, Music talk, Mixed martial arts, Motorcycle, Western, Fishing, Wrestling, Agriculture, Olympics, Diving, Racquet, Billiards, Home improvement, Law, Children, Sumo wrestling, Golf, Snowboarding, Equestrian, Awards, Beach soccer, Health, Running, Handball, Romance, Football, Music special, Religious, Sports talk, Arm wrestling, Drama, Historical drama, Boat, Game show, Luge, Anime, Auto racing, Baseball, Anthology, Gay/lesbian, Musical, Politics, Spanish, Collectibles, Parenting, Holiday special, Field hockey, Art, Hydroplane racing, Skeleton, Newsmagazine, Biography, Cricket, Biathlon, Darts, Sports non-event, Animated, Swimming, Bicycle racing, Pool, Tennis, Skateboarding, Romance-comedy, Intl hockey, Mystery, Shooting, Indoor soccer, Canoe, Variety, Polo, Speed skating, Arts/crafts, Basketball, Medical, Crime, Performing arts, Track/field, Animals, Intl soccer, Yacht racing, Aviation, Opera, Community, Dog racing, Entertainment, Soap, Technology, Watersports, Holiday-children, House/garden, Archery, Auto, Fantasy, Nature, Surfing, Auction, Gaming, Action, Lacrosse, Squash, Shopping, Skiing, Volleyball, Parade, Rodeo, Travel, Public affairs, Rugby, Bowling, Card games, Snowmobile, Event, Horror, Debate, Miniseries, Bicycle, Water polo, Musical comedy, Science, Standup, Horse, Poker, History, Sailing, Docudrama, Educational, Exercise, Soap talk, Hockey, Ballet, Suspense, Curling, Children-special, Weightlifting, Science fiction, Motorsports, Kayaking, Beach volleyball, Special, Motorcycle racing, Fundraiser, Hunting, Computers, Cooking, Figure skating, Holiday, Bullfighting, Children-music, Consumer, Mountain biking, Military, Crime drama, Environment, Bobsled, Paranormal, Comedy, Holiday music special, Outdoors, Adults only, Sports event, Water skiing, Theater, Children-talk, Pro wrestling, Boxing, Self improvement, Badminton)"
Eureka,"List(Talk, Documentary, Drama, Biathlon, Educational, Running, Soccer, Tennis, Horse, Debate, Olympics, Snowmobile, Animals, Basketball, Shooting, Beach soccer, Comedy-drama, Pool, Arts/crafts, Squash, Home improvement, Football, Theater, Beach volleyball, Docudrama, Water polo, Standup, Biography, Variety, Holiday, Adults only, Motorcycle racing, Opera, Playoff sports, Gymnastics, Badminton, Gay/lesbian, Intl hockey, Motorcycle, Bicycle, Canoe, Music talk, Darts, Racquet, Self improvement, Art, Holiday-children special, Mountain biking, Holiday special, Newsmagazine, Shopping, Equestrian, Politics, Crime, Bus./financial, Health, Weather, Animated, Dance, Dog racing, Soap, Environment, War, Rugby, Law, Motorsports, Bobsled, Speed skating, How-to, Skating, Soap talk, Sumo wrestling, Crime drama, Polo, Travel, Aerobics, Music, Horror, Hydroplane racing, Sports non-event, Drag racing, Spanish, Parade, Card games, Weightlifting, Auto, Billiards, Sitcom, Cooking, Rodeo, Collectibles, Romance-comedy, Anime, Boxing, Children-music, Dog show, Holiday music special, Baseball, Mystery, Paranormal, Intl soccer, Romance, Interview, Suspense, Table tennis, Softball, Bicycle racing, Children-special, Outdoors, Public affairs, Diving, Children-talk, Yacht racing, Anthology, Figure skating, Surfing, Adventure, Luge, Auction, Action, Auto racing, Cheerleading, Religious, Archery, Gaming, Technology, Wrestling, Bowling, Special, Entertainment, Fencing, Ballet, Snowboarding, Boat, Watersports, Field hockey, Poker, Handball, Volleyball, History, Skiing, Triathlon, Exercise, Pro wrestling, Nature, Community, Lacrosse, Swimming, Martial arts, Performing arts, Fashion, Fishing, Skateboarding, Arm wrestling, Mixed martial arts, Parenting, Historical drama, Hunting, Agriculture, Curling, Hockey, Game show, Medical, Kayaking, Music special, Action sports, Boat racing, Skeleton, Holiday-children, Water skiing, Sports event, Cricket, Musical, Western, Awards, Sailing, Miniseries, Bullfighting, Sports talk, Track/field, Consumer, Fantasy, Rowing, Science fiction, Computers, Musical comedy, Fundraiser, Gaelic football, Military, Golf, Event, Indoor soccer, Science, Aviation, Children, Comedy, House/garden, Reality, News)"
Harrisburg-Lncstr-Leb-York,"List(News, Sports event, Talk, Sitcom, Exercise, Science fiction, Musical, Action sports, Track/field, Cheerleading, Shooting, Soccer, Nature, Boat racing, Skeleton, Gaelic football, Skating, Football, Game show, Golf, Holiday, Holiday-children, Bicycle racing, Performing arts, Surfing, Kayaking, Wrestling, Soap talk, Hydroplane racing, Awards, Drag racing, Art, Motorcycle, Snowboarding, Horse, Sumo wrestling, Olympics, Beach soccer, House/garden, Swimming, Soap, Adventure, Triathlon, Mystery, Children, Anime, Children-special, Debate, Medical, Variety, Aerobics, Educational, Romance-comedy, Cricket, Crime, Music talk, Music, Biography, Drama, Newsmagazine, Softball, Handball, Opera, Bicycle, Fundraiser, Rugby, Crime drama, Equestrian, Mixed martial arts, Weather, Collectibles, Outdoors, Agriculture, Gaming, Beach volleyball, Technology, Military, Special, Travel, Water skiing, Ballet, Law, Politics, Water polo, Music special, Watersports, Skiing, Miniseries, Spanish, Polo, Curling, Intl hockey, Shopping, War, Table tennis, Volleyball, Boxing, Fantasy, Event, Figure skating, Musical comedy, Interview, Environment, Snowmobile, Aviation, Speed skating, Consumer, Holiday-children special, Weightlifting, Yacht racing, Computers, Dance, Dog show, Science, Health, Martial arts, Western, Public affairs, Rodeo, Home improvement, Historical drama, Archery, Bobsled, Field hockey, Skateboarding, Entertainment, Poker, Auto, History, Intl soccer, Motorsports, Auction, How-to, Action, Canoe, Sports talk, Bowling, Diving, Children-music, Horror, Racquet, Self improvement, Fishing, Community, Adults only, Holiday music special, Badminton, Boat, Hunting, Bullfighting, Rowing, Suspense, Fencing, Documentary, Pool, Cooking, Darts, Bus./financial, Anthology, Animated, Tennis, Theater, Pro wrestling, Billiards, Luge, Romance, Paranormal, Sailing, Animals, Biathlon, Motorcycle racing, Children-talk, Arts/crafts, Standup, Parenting, Auto racing, Baseball, Indoor soccer, Basketball, Comedy, Squash, Hockey, Holiday special, Arm wrestling, Mountain biking, Gay/lesbian, Fashion, Sports non-event, Gymnastics, Parade, Card games, Docudrama, Lacrosse, Religious, Comedy-drama, Dog racing, Playoff sports, Running, Reality)"


# question 2.2

In [0]:


from pyspark.sql.functions import col, when
from pyspark.sql.functions import col, when

ref2 = ref_data.select(['device_id','dma', "household_id"]).na.drop().dropDuplicates()

# Filling null values with 0 and converting columns to appropriate data types
demographic_data_2 = selected_demographic_data_df.fillna(0, subset=["income", "net_worth"])
demographic_data_2 = demographic_data_2.withColumn("income", when(col("income") == "A", 10)
                                .when(col("income") == "B", 11)
                                .when(col("income") == "C", 12)
                                .when(col("income") == "D", 13)
                                .otherwise(col("income")))
#casting the income and integer to 
demographic_data_2 = demographic_data_2.withColumn("income", col("income").cast("integer"))
demographic_data_2 = demographic_data_2.withColumn("net_worth", col("net_worth").cast("integer"))

demographic_data_2 = demographic_data_2.withColumn('income', when(col('income').between(1, 13), col('income')).otherwise(0))
demographic_data_2 = demographic_data_2.withColumn('net_worth', when(col('net_worth').between(1, 13), col('net_worth')).otherwise(0))

demographic_data_2 = demographic_data_2.select("household_id", "household_size", "num_adults", "net_worth", "income")




In [0]:
from pyspark.sql.functions import lit,max, avg

dma_with_wealth = ref2.select("dma", "household_id").join(demographic_data_2.select("household_id", "income", "net_worth"), on="household_id", how="inner")
dma_with_wealth = dma_with_wealth.fillna(0, subset=["income", "net_worth"])

# Calculating max net and max income:
max_net = demographic_data_2.agg(max("net_worth")).collect()[0][0]
max_income = demographic_data_2.agg(max("income")).collect()[0][0]

# Calculating wealth_score:
dma_with_wealth2 = dma_with_wealth.groupBy("dma").agg(((avg("net_worth") / max_net) + (avg("income") / max_income)).alias('wealth_score')).orderBy(desc("wealth_score"))





In [0]:
from pyspark.sql.functions import desc, avg, count, lit, col, when, concat_ws, expr, concat
from pyspark.sql import functions as F


# Joining the dma_with_wealth and genre_entry_count_df DataFrames
dma_with_wealth = dma_with_wealth2.join(genre_entry_count_df, on="dma", how="left")

# ordering the columns by wealth score descending order
dma_with_wealth = dma_with_wealth.select("wealth_score", "dma", "genre", "Entry_Count").distinct().orderBy(desc("wealth_score"))

# Creating a list of ordered DMAs by wealth in descending order
ordered_dma_list = dma_with_wealth.select("dma").distinct().orderBy(desc("wealth_score")).collect()
ordered_dma_list = [row["dma"] for row in ordered_dma_list]
genres_list = []
unknown_dmas = [] 

#Checking where the genres are null and creating a list where the dma has those genres
null_dma = dma_with_wealth.filter(col("genre").isNull()).select("dma").collect()
null_dma = [row["dma"] for row in null_dma]
unknown_dmas.extend(null_dma)

# sorting wealth by descending order of score and entry count
wealth_popularity_current = dma_with_wealth.orderBy(desc("wealth_score"), desc("Entry_Count"))
wealth_popularity_current.cache()
current_wealth = None


# # Creating a DataFrame of all DMAs
# all_dma = ref2.select("dma").distinct()



In [0]:
from pyspark.sql.functions import desc, avg, count, lit, col, when, concat_ws, expr, concat
from pyspark.sql import functions as F

# Creating a DataFrame of all DMAs
all_dma = ref2.select("dma").distinct()

i=0
for dma in ordered_dma_list:
    if dma not in null_dma:
        if i != 0: 
            wealth_popularity_current = wealth_popularity_current.filter(col("genre").isin(genres_list) == False)



        # Filtering by dma and take the top 8 entry counts ordering the wealth_popularity_current DataFrame
        current_wealth = wealth_popularity_current.filter(col("dma")== dma).orderBy(desc("Entry_Count")).limit(8)

        genres = current_wealth.select("genre").collect() 
        genres_list = [row["genre"] for row in genres]
      
    if current_wealth is not None and current_wealth.isEmpty():
        unknown_dmas.append(dma)
    
    # Concatenating results if its the first itteration intialize all_dma to be current_wealth else add this to all_dma df
    if current_wealth is not None and (i==0):
        all_dma = current_wealth
    elif current_wealth is not None and (i !=0):
        all_dma = all_dma.union(current_wealth)
 
    # Print the first from the top 25 of DMAs, wealth score, and ordered list of genres
    if i < 25:
        wealth = dma_with_wealth.filter(col("dma") == dma).select("wealth_score").first()[0]          
        if dma in null_dma or current_wealth.isEmpty():
            print(f"({dma}, {wealth}, [])\n")  
        else:
            print(f"({dma}, {wealth}, {genres_list})\n")
 
    i=i+1


# Creating the DataFrame with missing DMAs and empty attributes
missing_dma_df = spark.createDataFrame([(dma,) for dma in unknown_dmas], ["dma"])
missing_dma_df = missing_dma_df.join(dma_with_wealth.select("dma", "wealth_score"), on="dma", how="inner").distinct()
missing_dma_df = missing_dma_df.withColumn("genre", F.lit("[]")).select("dma", "wealth_score", "genre")
 
# Transforming the all_dma data frame before union with missing_dma_df
res_dma_wealth = all_dma.orderBy(desc("wealth_score"), desc("Entry_Count"))
res_dma_wealth = res_dma_wealth.groupBy("dma").agg(F.collect_list("genre").alias("genre"))
res_dma_wealth = res_dma_wealth.join(dma_with_wealth.select("dma", "wealth_score"), on="dma", how="inner").distinct()
res_dma_wealth = res_dma_wealth.withColumn("genre", F.expr("concat('[', concat_ws(', ', genre), ']')"))
res_dma_wealth = res_dma_wealth.select("dma", "wealth_score", "genre")

# Unite the missing dmas genres with the dmas that have genres to a single data frame
res_dma_wealth = res_dma_wealth.union(missing_dma_df)

# Displaying the final DataFrame with dma, wealth_score, and genre ordered by wealth_score
display(res_dma_wealth.select("dma", "wealth_score", "genre").orderBy(desc("wealth_score")))


(San Antonio, 1.623931623931624, [])

(Baltimore, 1.3484309314846228, ['Figure skating', 'Motorsports', 'Intl soccer', 'Shooting', 'Performing arts', 'Archery', 'Dog show', 'Drag racing'])

(San Francisco-Oak-San Jose, 1.3357808419815185, ['Wrestling', 'Watersports', 'Weightlifting', 'Yacht racing', 'Fundraiser', 'Surfing', 'Rugby', 'Boat'])

(Detroit, 1.305826181524095, ['Bowling', 'Swimming', 'Diving', 'Running', 'Arm wrestling', 'Billiards', 'Computers', 'Snowboarding'])

(Austin, 1.2722546588818684, ['Skateboarding', 'Beach volleyball', 'Snowmobile', 'Olympics', 'Sailing', 'Triathlon', 'Pool', 'Music special'])

(Sacramnto-Stkton-Modesto, 1.2359677521362542, ['Water polo', 'Holiday', 'Darts', 'Theater', 'Curling', 'Holiday special', 'Kayaking', 'Rowing'])

(Cleveland-Akron (Canton), 1.211882525608016, ['Boat racing', 'Field hockey', 'Speed skating', 'Beach soccer', 'Cricket', 'Racquet', 'Skeleton', 'Bobsled'])

(Harrisburg-Lncstr-Leb-York, 1.2019055299195078, ['Holiday music specia

dma,wealth_score,genre
San Antonio,1.623931623931624,[]
Baltimore,1.3484309314846228,"[Figure skating, Motorsports, Intl soccer, Shooting, Performing arts, Archery, Dog show, Drag racing]"
San Francisco-Oak-San Jose,1.3357808419815185,"[Wrestling, Watersports, Weightlifting, Yacht racing, Fundraiser, Surfing, Rugby, Boat]"
Detroit,1.305826181524095,"[Bowling, Swimming, Diving, Running, Arm wrestling, Billiards, Computers, Snowboarding]"
Austin,1.2722546588818684,"[Skateboarding, Beach volleyball, Snowmobile, Olympics, Sailing, Triathlon, Pool, Music special]"
Sacramnto-Stkton-Modesto,1.2359677521362542,"[Water polo, Holiday, Darts, Theater, Curling, Holiday special, Kayaking, Rowing]"
Cleveland-Akron (Canton),1.211882525608016,"[Boat racing, Field hockey, Speed skating, Beach soccer, Cricket, Racquet, Skeleton, Bobsled]"
Harrisburg-Lncstr-Leb-York,1.2019055299195078,"[Holiday music special, Luge, Squash, Dog racing, Mountain biking, Holiday-children special, Polo, Skating]"
Toledo,1.199510473429358,"[Handball, Fencing, Aerobics]"
Philadelphia,1.1948499023773462,"[Table tennis, Bicycle]"


#part 3

In [0]:
import os
from pyspark.sql.functions import col

# Extracting all genres from dma
dma_genre = data_join_ref.select("dma", "genre").distinct()
#create list of disitnct dma
dma_list=data_join_ref.select("dma").distinct()

#Creating a path to all folders to be kept in
all_folders_path = "/Users/yuvalmar@campus.technion.ac.il//part3"
# Creating folders for every DMA 
for dma in dma_list:
    
    folder_path = f"{all_folders_path}/{dma}"
    dbutils.fs.mkdirs(folder_path)

    # Extracting all relevant genres thats in the DMA
    genres_list = dma_genre.filter(dma_genre["dma"] == dma ).select("genre").collect()
    genres_list = [row["genre"] for row in genres_list]

    for genre in genres_list:
        #filter the progs of this genre if it exists in daily_prog_data
        filtered_prog_by_genre = daily_prog_data.filter(col("genre").contains(genre)) 
         
        # create table with the name genres_progs_table
        genres_progs_table = f"{genre}_data".replace(" ", "_").replace("-", "_").lower()
            
        # parition the DMA by genre and create a csv path and write into it the file of genre and it programs
        csv_path = os.path.join(folder_path, genres_progs_table)
        filtered_prog_by_genre.write.partitionBy("genre").csv(csv_path, header=True, mode="append")
