In [1]:
import altair as alt
import pandas as pd
import numpy as np
import timeit
from IPython.display import Image, display

alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [2]:
tvseries_fname = "TMDB_tv_dataset_v3.csv"

In [3]:
# Load data
tvseries_df = pd.read_csv("data/"+tvseries_fname)


In [4]:
# TV Series - Attributes
print("Rows: ", len(tvseries_df))
print("Columns: ", len(tvseries_df.columns))
print(list(tvseries_df.columns))
print("-----------")
# print(tvseries_df["episode_run_time"].unique())

Rows:  166383
Columns:  29
['id', 'name', 'number_of_seasons', 'number_of_episodes', 'original_language', 'vote_count', 'vote_average', 'overview', 'adult', 'backdrop_path', 'first_air_date', 'last_air_date', 'homepage', 'in_production', 'original_name', 'popularity', 'poster_path', 'type', 'status', 'tagline', 'genres', 'created_by', 'languages', 'networks', 'origin_country', 'spoken_languages', 'production_companies', 'production_countries', 'episode_run_time']
-----------


In [5]:
# TV Series - Attributes
# ---------------------------------------------------
# id - Integer, quantitative???
# name - String, categorical 
# number_of_seasons - integer, quantitative
# number_of_episodes - integer, quantitative
# original_language - String, categorical - iso language code
# vote_count - Integer, quantitative, sequential
# vote_average - Double, quantitative, sequential
# overview - String, categorical - movie summary, kinda useless - useless
# adult - boolean, categorical
# backdrop_path - String, categorical - should be removed cause it store image path - useless
# first_air_date - Date, sequential
# last_air_date - Date, sequential
# homepage - String, categorical - should be removed cause it refers to the movie's website - useless
# in_production - Boolean, - not sure what it could bring to table
# original_name - String, categorical - not sure how useful
# popularity - Double, Sequential - not sure what the value represent
# poster_path - String, categorical - remove this, poster image - useless
# type - String, categorical
# status - String, categorical
# tagline - String, categorical - not sure this field could be meaningful - useless
# genres - List<String>, categorical - contains multiple value, candidate for connection/graph
# created_by - String, quantitative
# languages - List<String>, categorical - contains multiple value, not sure how good this is
# networks - List<String>, categorical
# origin_country - String, categorical
# spoken_languages - List<String>, categorical - contains multiple value, not sure how good this is - useless
# production_companies - List<String>, categorical - contains multiple value, candidate for connection/graph
# production_countries - List<String>, categorical - contains multiple value, candidate for connection/graph
# episode_run_time - Integer, sequential - duration in minutes per episode, some are zero

In [6]:
tvseries_df.isna().sum()

id                           0
name                         5
number_of_seasons            0
number_of_episodes           0
original_language            0
vote_count                   0
vote_average                 0
overview                 73834
adult                        0
backdrop_path            89285
first_air_date           31478
last_air_date            29646
homepage                115768
in_production                0
original_name                5
popularity                   0
poster_path              59099
type                         0
status                       0
tagline                 161074
genres                   67628
created_by              130225
languages                57415
networks                 69546
origin_country           31025
spoken_languages         58174
production_companies    107630
production_countries     89679
episode_run_time             0
dtype: int64

In [7]:
tvseries_df_columns = tvseries_df.columns
for column in tvseries_df_columns:
    print(column)
    print(tvseries_df[column].unique())
    print()
    print()

id
[  1399  71446  66732 ... 239216 239217 239218]


name
['Game of Thrones' 'Money Heist' 'Stranger Things' ... 'Parenting'
 'BPM Ecstasy' 'Génération 2008 : qui seront-ils demain ?']


number_of_seasons
[  8   3   4  11   6   7   2   5   1   9  12  19  35  20  10  15  21  22
  26  16  24  14  13  25  30  33  27  17  29  49  45  31  23  18  53  39
   0  72  51  32  37  40  36  54  50  64  34  38  41  71  28  56  66  47
  42  60 240  68  59 188  43  67  52  74  46  80  73  61  57  44  63  75
  84  48  58  62  55  70 134 101  95 111]


number_of_episodes
[   73    41    34   177    93   137     9    62   116   184   279    12
   419    24   762    16    32    36    49    71   171     6   500   100
     8   228   327    89    26    50    55   170    87     5   130   220
   441    96   131   113    88   138   134    27   208    63   129    33
    45   291  1080    39   126   151   411   321     7   216   118    37
    61   186   334    10   538   136    20   335    92    52   246   152
  

In [8]:
# ------------------------
# TV Series

tvseries_df = tvseries_df.drop(['backdrop_path', 'homepage', 'poster_path', 'overview', 'tagline'], axis=1)
tvseries_df = tvseries_df.dropna(subset=tvseries_df.columns.difference(['last_air_date']))

# get year later than 1990
tvseries_df['first_air_date'] = pd.to_datetime(tvseries_df['first_air_date'])
tvseries_df['last_air_date'] = pd.to_datetime(tvseries_df['last_air_date'])
tvseries_df = tvseries_df[tvseries_df['first_air_date'].dt.year >= 1990]


tvseries_df.loc[tvseries_df["origin_country"] == "XC", "origin_country"] = "CZ"

for ccode in ["YU", "XG", "XK", "SU"]:
    tvseries_df.drop(tvseries_df[tvseries_df.origin_country == ccode].index, inplace=True)
# tvseries_df = tvseries_df.dropna(subset=tvseries_df.columns.difference(['last_air_date']), how='all')
# tvseries_df = tvseries_df.dropna()

In [9]:
# tvseries_df.loc[tvseries_df["origin_country"] == "XC", "origin_country"] = "CZ"
tvseries_df["origin_country"].isin(["YU", "XG", "XK", "SU"]) == True

0         False
1         False
2         False
3         False
4         False
          ...  
166307    False
166319    False
166329    False
166360    False
166361    False
Name: origin_country, Length: 15186, dtype: bool

In [10]:
cleaned_file_name = "data/cleaned_TMDB_tv_dataset.csv"
tvseries_df.to_csv(cleaned_file_name, date_format='%Y-%m-%d')

cleaned_df = pd.read_csv(cleaned_file_name)
print(len(cleaned_df))


15186


In [11]:
tvseries_df = cleaned_df
print("After clean up")
print("Rows: ", len(tvseries_df))
print("Columns: ", len(tvseries_df.columns))

After clean up
Rows:  15186
Columns:  25


In [12]:
tvseries_df.isna().sum()
tvseries_df['first_air_date'] = pd.to_datetime(tvseries_df['first_air_date'], format='%Y-%m-%d')
tvseries_df[tvseries_df['first_air_date'].dt.year >= 1990]
# tvseries_df

Unnamed: 0.1,Unnamed: 0,id,name,number_of_seasons,number_of_episodes,original_language,vote_count,vote_average,adult,first_air_date,...,status,genres,created_by,languages,networks,origin_country,spoken_languages,production_companies,production_countries,episode_run_time
0,0,1399,Game of Thrones,8,73,en,21857,8.442,False,2011-04-17,...,Ended,"Sci-Fi & Fantasy, Drama, Action & Adventure","David Benioff, D.B. Weiss",en,HBO,US,English,"Revolution Sun Studios, Television 360, Genera...","United Kingdom, United States of America",0
1,1,71446,Money Heist,3,41,es,17836,8.257,False,2017-05-02,...,Ended,"Crime, Drama",Álex Pina,es,"Netflix, Antena 3",ES,Español,Vancouver Media,Spain,70
2,2,66732,Stranger Things,4,34,en,16161,8.624,False,2016-07-15,...,Returning Series,"Drama, Sci-Fi & Fantasy, Mystery","Matt Duffer, Ross Duffer",en,Netflix,US,English,"21 Laps Entertainment, Monkey Massacre Product...",United States of America,0
3,3,1402,The Walking Dead,11,177,en,15432,8.121,False,2010-10-31,...,Ended,"Action & Adventure, Drama, Sci-Fi & Fantasy",Frank Darabont,en,AMC,US,English,"AMC Studios, Circle of Confusion, Valhalla Mot...",United States of America,42
4,4,63174,Lucifer,6,93,en,13870,8.486,False,2016-01-25,...,Ended,"Crime, Sci-Fi & Fantasy",Tom Kapinos,en,"FOX, Netflix",US,English,"Warner Bros. Television, DC Entertainment, Jer...",United States of America,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15181,166307,237992,Frozen in Time: Flashback,1,8,en,0,0.000,False,2021-02-20,...,Ended,"Documentary, Reality, Family",Cindy Bertram,en,"HGTV, discovery+",US,English,Glass Entertainment Group,United States of America,0
15182,166319,238974,Valdes Jul,1,1,da,0,0.000,False,2023-12-01,...,Returning Series,"Kids, Family","Jesper Fink, Thomas Porsager",da,TV 2,DK,Dansk,Cosmo Film,Denmark,0
15183,166329,238688,Doraleous and Associates,1,50,en,0,0.000,False,2012-08-13,...,Ended,"Animation, Comedy, Sci-Fi & Fantasy",Hank and Jed,en,YouTube,US,English,Hank and Jed,United States of America,333
15184,166360,239216,BPM Ecstasy,1,4,fr,0,0.000,False,2023-11-14,...,Ended,Documentary,Olivier Richard,fr,"Arte , Taiwan+",FR,Français,"ARTE, Zorba Production, Volos Films, TaiwanPlus","France, Taiwan",0


In [17]:
alt.Chart(tvseries_df).mark_bar().encode(
    x='year(first_air_date):O',
    y='count()',
    tooltip='count()',
).interactive()

In [21]:
print(len(tvseries_df))
include = tvseries_df[tvseries_df['first_air_date'].dt.year >= 1990]
print(len(include))

16751
15191


In [14]:
# Explore trends in TV show popularity based on vote count and average.
alt.Chart(tvseries_df.head(5000)).mark_circle(size=60).encode(
    x='vote_average',
    y='vote_count',
    # color='Origin',
    tooltip=['name', 'vote_count', 'vote_average']
).interactive()
# .properties(
#     width=1000,
#     height=600
# ).interactive()

In [15]:
# Identify the most prolific TV show creators or production companies based on the number of shows they have created.
# print(tvseries_df["created_by"])
# print(tvseries_df.dtypes)
alt.Chart(tvseries_df.head(100)).mark_bar().encode(
    x='created_by',
    y='count()'
).interactive()
# .transform_filter(
    # datum.year == 2000
# )


# alt.Chart().mark_bar().encode(
#     y='created_by',
#     x='number_of_seasons'
# )

In [16]:
tvseries_df['genre'] = tvseries_df['genres'].str.split(",")
tvseries_df2 = tvseries_df.explode('genre')
# print(tvseries_df2.head(5))

In [17]:
tvseries_df2.head(5)

Unnamed: 0.1,Unnamed: 0,id,name,number_of_seasons,number_of_episodes,original_language,vote_count,vote_average,adult,first_air_date,...,genres,created_by,languages,networks,origin_country,spoken_languages,production_companies,production_countries,episode_run_time,genre
0,0,1399,Game of Thrones,8,73,en,21857,8.442,False,2011-04-17,...,"Sci-Fi & Fantasy, Drama, Action & Adventure","David Benioff, D.B. Weiss",en,HBO,US,English,"Revolution Sun Studios, Television 360, Genera...","United Kingdom, United States of America",0,Sci-Fi & Fantasy
0,0,1399,Game of Thrones,8,73,en,21857,8.442,False,2011-04-17,...,"Sci-Fi & Fantasy, Drama, Action & Adventure","David Benioff, D.B. Weiss",en,HBO,US,English,"Revolution Sun Studios, Television 360, Genera...","United Kingdom, United States of America",0,Drama
0,0,1399,Game of Thrones,8,73,en,21857,8.442,False,2011-04-17,...,"Sci-Fi & Fantasy, Drama, Action & Adventure","David Benioff, D.B. Weiss",en,HBO,US,English,"Revolution Sun Studios, Television 360, Genera...","United Kingdom, United States of America",0,Action & Adventure
1,1,71446,Money Heist,3,41,es,17836,8.257,False,2017-05-02,...,"Crime, Drama",Álex Pina,es,"Netflix, Antena 3",ES,Español,Vancouver Media,Spain,70,Crime
1,1,71446,Money Heist,3,41,es,17836,8.257,False,2017-05-02,...,"Crime, Drama",Álex Pina,es,"Netflix, Antena 3",ES,Español,Vancouver Media,Spain,70,Drama


In [18]:
alt.Chart(tvseries_df2.head(5)).mark_bar().encode(
    x='genre',
    y='count()'
).interactive()

In [19]:
from vega_datasets import data
pop = data.population_engineers_hurricanes()
pop.head()
# pop[['state', 'id']].to_dict()
# dict(zip(pop.state, pop.id))

Unnamed: 0,state,id,population,engineers,hurricanes
0,Alabama,1,4863300,0.003422,22
1,Alaska,2,741894,0.001591,0
2,Arizona,4,6931071,0.004774,0
3,Arkansas,5,2988248,0.00244,0
4,California,6,39250017,0.007126,0


In [24]:
states = alt.topo_feature(data.us_10m.url, 'states')
print(states.to_json())


alt.Chart(states).mark_geoshape().encode(
    color='population:Q'
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(pop, 'id', list(pop.columns))
).properties(
    width=500,
    height=300
).project(
    type='albersUsa'
)

{
  "format": {
    "feature": "states",
    "type": "topojson"
  },
  "url": "https://cdn.jsdelivr.net/npm/vega-datasets@v1.29.0/data/us-10m.json"
}


In [25]:
map_states_id = {'Alabama': 1,
 'Alaska': 2,
 'Arizona': 4,
 'Arkansas': 5,
 'California': 6,
 'Colorado': 8,
 'Connecticut': 9,
 'Delaware': 10,
 'District of Columbia': 11,
 'Florida': 12,
 'Georgia': 13,
 'Hawaii': 15,
 'Idaho': 16,
 'Illinois': 17,
 'Indiana': 18,
 'Iowa': 19,
 'Kansas': 20,
 'Kentucky': 21,
 'Louisiana': 22,
 'Maine': 23,
 'Maryland': 24,
 'Massachusetts': 25,
 'Michigan': 26,
 'Minnesota': 27,
 'Mississippi': 28,
 'Missouri': 29,
 'Montana': 30,
 'Nebraska': 31,
 'Nevada': 32,
 'New Hampshire': 33,
 'New Jersey': 34,
 'New Mexico': 35,
 'New York': 36,
 'North Carolina': 37,
 'North Dakota': 38,
 'Ohio': 39,
 'Oklahoma': 40,
 'Oregon': 41,
 'Pennsylvania': 42,
 'Rhodes Island': 44,
 'South Carolina': 45,
 'South Dakota': 46,
 'Tennessee': 47,
 'Texas': 48,
 'Utah': 49,
 'Vermont': 50,
 'Virginia': 51,
 'Washington': 53,
 'West Virginia': 54,
 'Wisconsin': 55,
 'Wyoming': 56,
 'Puerto Rico': 72}

homicides_df["State ID"] = homicides_df["State"].map(map_states_id)
homicides_df = homicides_df.astype({"State ID": int})

In [26]:
homicides_df.head()
# print(homicides_df["State ID"].unique())

Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,...,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,Victim Count,Perpetrator Count,Record Source,State ID
0,1,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,...,Male,15,Native American/Alaska Native,Unknown,Acquaintance,Blunt Object,0,0,FBI,2
1,2,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,...,Male,42,White,Unknown,Acquaintance,Strangulation,0,0,FBI,2
2,3,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,...,Unknown,0,Unknown,Unknown,Unknown,Unknown,0,0,FBI,2
3,4,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,...,Male,42,White,Unknown,Acquaintance,Strangulation,0,0,FBI,2
4,5,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,...,Unknown,0,Unknown,Unknown,Unknown,Unknown,0,1,FBI,2


In [71]:
homocides_sum_by_state_df = homicides_df.groupby(['State ID', 'State'], as_index=False).sum('Incident')
homocides_sum_by_state_df.head()

data['subjects'].value_counts()['php']
# print(homocides_sum_by_state_df.columns)

Unnamed: 0,State ID,State,Record ID,Year,Incident,Victim Age,Perpetrator Age,Victim Count,Perpetrator Count
0,1,Alabama,3107984831,22680685,38149,494547,267515,512,738
1,2,Alaska,500600479,3227204,2704,55379,40097,380,213
2,4,Arizona,4657760650,25722653,61337,465003,258285,1698,1908
3,5,Arkansas,2186342405,13866208,12898,276512,182422,1068,1484
4,6,California,30328411741,199099051,1011666,3233503,1828125,11450,19198


In [74]:
# print(homicides_df.columns)
# print(homocides_sum_by_state_df["State"].unique())

alt.Chart(states).mark_geoshape().encode(
    tooltip=['State:N', 'Incident:Q'],
    color='Incident:Q'
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(homocides_sum_by_state_df, 'State ID', list(homocides_sum_by_state_df.columns))
).properties(
    width=500,
    height=300
).project(
    type='albersUsa'
).interactive()

In [2]:
import pandas as pd
cleaned_file_name = "data/cleaned_TMDB_tv_dataset.csv"
# tvseries_df.to_csv(cleaned_file_name, date_format='%Y-%m-%d')

cleaned_df = pd.read_csv(cleaned_file_name)
print(len(cleaned_df))

15186


In [3]:
cleaned_df.id == 56094

0        False
1        False
2        False
3        False
4        False
         ...  
15181    False
15182    False
15183    False
15184    False
15185    False
Name: id, Length: 15186, dtype: bool

In [4]:
cleaned_df = cleaned_df.drop(cleaned_df[cleaned_df.id == 56094].index)

In [5]:
cleaned_df[cleaned_df.id == 56094]

Unnamed: 0.1,Unnamed: 0,id,name,number_of_seasons,number_of_episodes,original_language,vote_count,vote_average,adult,first_air_date,...,status,genres,created_by,languages,networks,origin_country,spoken_languages,production_companies,production_countries,episode_run_time


In [34]:
cleaned_df.to_csv(cleaned_file_name, date_format='%Y-%m-%d')

NameError: name 'cleaned_df' is not defined

In [41]:
movies_df = pd.read_csv("data/movie_dataset_backup.csv")

In [42]:
movies_df.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'release_date', 'revenue',
       'runtime', 'adult', 'budget', 'original_language', 'popularity',
       'genres', 'production_companies', 'production_countries',
       'spoken_languages', 'iso_countries', 'oscar'],
      dtype='object')

In [43]:
len(movies_df)

12469

In [44]:
movies_df = movies_df[movies_df['vote_count']>=50]
len(movies_df)

8749

In [45]:
movies_df = movies_df[movies_df['runtime']>=30]
movies_df['release_date'] = pd.to_datetime(movies_df['release_date'])
movies_df = movies_df.sort_values(['release_date'], ascending=[False])
len(movies_df)

8747

In [46]:
cleaned_file_name = "data/movie_dataset.csv"
movies_df.to_csv(cleaned_file_name, date_format='%Y-%m-%d', na_rep='None', index=False)