# Basic Data Insights Using the Netflix Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = {
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [True, False, True, False, True]
}
df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C
0,1,a,True
1,2,b,False
2,3,c,True
3,4,d,False
4,5,e,True


In [3]:
count_result = df.count()
print("Count of non-null values in each column:")
print(count_result)
print()

Count of non-null values in each column:
A    5
B    5
C    5
dtype: int64



In [4]:
s = pd.Series(['apple', 'banana', 'apple', 'orange', 'banana'])
# Count unique values in the Series
count_values_result = s.value_counts()
print("Count of unique values in the Series:")
print(count_values_result)
print()

Count of unique values in the Series:
apple     2
banana    2
orange    1
Name: count, dtype: int64



In [5]:
# Date and Time example

date = "2024-10-31"

# Convert string to datetime
date_time = pd.to_datetime(date)
print("Converted date and time:")
print(date_time)

# Extracting date components
year = date_time.year
month = date_time.month
day = date_time.day
print(f"Year: {year}, Month: {month}, Day: {day}")

# Formatting date
formatted_date = date_time.strftime("%d-%m-%Y")
print("Formatted date:")
print(formatted_date)


# Extracting time components
hour = date_time.hour
minute = date_time.minute
second = date_time.second
print(f"Hour: {hour}, Minute: {minute}, Second: {second}")

Converted date and time:
2024-10-31 00:00:00
Year: 2024, Month: 10, Day: 31
Formatted date:
31-10-2024
Hour: 0, Minute: 0, Second: 0


In [6]:
# dt.date example

# Create a Series of datetime objects
date_series = pd.Series(pd.to_datetime(['2024-10-31', '2024-11-01', '2024-11-02']))
# Extract date component
date_component = date_series.dt.date
print("Extracted date component:")
print(date_component)
# Extracting year, month, and day
year_component = date_series.dt.year
month_component = date_series.dt.month
day_component = date_series.dt.day
print(f"Year component:\n{year_component}")
print(f"Month component:\n{month_component}")
print(f"Day component:\n{day_component}")

Extracted date component:
0    2024-10-31
1    2024-11-01
2    2024-11-02
dtype: object
Year component:
0    2024
1    2024
2    2024
dtype: int32
Month component:
0    10
1    11
2    11
dtype: int32
Day component:
0    31
1     1
2     2
dtype: int32


In [7]:
# Date time series
date_time_series = pd.Series([
    '2024-10-31 10:00:00',
    '2024-11-01 15:30:00',
    '2024-11-02 20:45:00',
    '2024-11-03 08:15:00',
    '2024-11-04 12:00:00'
])

# get hour 
hour_series = pd.to_datetime(date_time_series).dt.hour
print("Hour series:")
print(hour_series)

Hour series:
0    10
1    15
2    20
3     8
4    12
dtype: int32


In [8]:
# Sample DataFrame
data = {
    'person': ['Ravi', 'Priya', 'Amit', 'Neha', 'Suresh'],
    'cities_visited': [
        'Delhi, Mumbai, Bangalore',
        'Chennai, Hyderabad',
        'Ahmedabad, Jaipur, Surat, Pune',
        'Kolkata',
        'Bhopal, Indore'
    ]
}

In [9]:
df = pd.DataFrame(data)
df

Unnamed: 0,person,cities_visited
0,Ravi,"Delhi, Mumbai, Bangalore"
1,Priya,"Chennai, Hyderabad"
2,Amit,"Ahmedabad, Jaipur, Surat, Pune"
3,Neha,Kolkata
4,Suresh,"Bhopal, Indore"


In [10]:
df['cities_list'] = df['cities_visited'].str.split(', ')
print("DataFrame after splitting cities_visited:")
df

DataFrame after splitting cities_visited:


Unnamed: 0,person,cities_visited,cities_list
0,Ravi,"Delhi, Mumbai, Bangalore","[Delhi, Mumbai, Bangalore]"
1,Priya,"Chennai, Hyderabad","[Chennai, Hyderabad]"
2,Amit,"Ahmedabad, Jaipur, Surat, Pune","[Ahmedabad, Jaipur, Surat, Pune]"
3,Neha,Kolkata,[Kolkata]
4,Suresh,"Bhopal, Indore","[Bhopal, Indore]"


In [11]:
# Explode the list into separate rows
df_exploded = df.explode('cities_list')

# Optional: Clean up the DataFrame for presentation
df_exploded = df_exploded[['person', 'cities_list']].rename(columns={'cities_list': 'city_visited'})

print("\n Final DataFrame after .explode():")
print(df_exploded.reset_index(drop=True))


 Final DataFrame after .explode():
    person city_visited
0     Ravi        Delhi
1     Ravi       Mumbai
2     Ravi    Bangalore
3    Priya      Chennai
4    Priya    Hyderabad
5     Amit    Ahmedabad
6     Amit       Jaipur
7     Amit        Surat
8     Amit         Pune
9     Neha      Kolkata
10  Suresh       Bhopal
11  Suresh       Indore


In [12]:
# Find all people who visited 'Mumbai'
visited_mumbai = df_exploded[df_exploded['city_visited'] == 'Mumbai']
print("\nPeople who visited Mumbai:")
print(visited_mumbai)


People who visited Mumbai:
  person city_visited
0   Ravi       Mumbai


In [13]:
import pandas as pd

# Simple DataFrame
df = pd.DataFrame({
    'name': ['Anil', 'Sunita', 'Raj'],
    'fruits': ['Apple, Mango', 'Banana, Orange, Guava', 'Pineapple']
})

# One-liner to split and explode
df_exploded = df.assign(fruits=df['fruits'].str.split(', ')).explode('fruits')

# Display result
print(df_exploded)

     name     fruits
0    Anil      Apple
0    Anil      Mango
1  Sunita     Banana
1  Sunita     Orange
1  Sunita      Guava
2     Raj  Pineapple


In [14]:
import pandas as pd

data = {
    'Region': ['East', 'West', 'East', 'South', 'West', 'South', 'East', 'North', 'North', 'South', 'East', 'West', 'South', 'North', 'East'],
    'Salesperson': ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob', 'Charlie', 'Alice', 'David', 'Eve', 'Eve', 'Alice', 'Bob', 'Charlie', 'David', 'Alice'],
    'Sales': [200, 150, 340, 300, 120, 330, 220, 400, 250, 310, 180, 140, 320, 410, 230],
    'Units': [5, 3, 8, 6, 2, 7, 4, 10, 6, 9, 5, 3, 8, 11, 4]
}

df = pd.DataFrame(data)
print(df)

   Region Salesperson  Sales  Units
0    East       Alice    200      5
1    West         Bob    150      3
2    East       Alice    340      8
3   South     Charlie    300      6
4    West         Bob    120      2
5   South     Charlie    330      7
6    East       Alice    220      4
7   North       David    400     10
8   North         Eve    250      6
9   South         Eve    310      9
10   East       Alice    180      5
11   West         Bob    140      3
12  South     Charlie    320      8
13  North       David    410     11
14   East       Alice    230      4


In [15]:
region_sales = df.groupby('Region')['Sales'].sum()
print(region_sales)

Region
East     1170
North    1060
South    1260
West      410
Name: Sales, dtype: int64


In [16]:
agg_result = df.groupby('Region').agg({
    'Sales': ['sum', 'mean', 'max'],
    'Units': ['sum', 'min']
})
print(agg_result)

       Sales                  Units    
         sum        mean  max   sum min
Region                                 
East    1170  234.000000  340    26   4
North   1060  353.333333  410    27   6
South   1260  315.000000  330    30   6
West     410  136.666667  150     8   2


In [17]:
df['Total_Sales'] = df['Sales'] * df['Units']
df

Unnamed: 0,Region,Salesperson,Sales,Units,Total_Sales
0,East,Alice,200,5,1000
1,West,Bob,150,3,450
2,East,Alice,340,8,2720
3,South,Charlie,300,6,1800
4,West,Bob,120,2,240
5,South,Charlie,330,7,2310
6,East,Alice,220,4,880
7,North,David,400,10,4000
8,North,Eve,250,6,1500
9,South,Eve,310,9,2790


In [18]:
regional_total_sales = df.groupby('Region')['Total_Sales'].sum()
print(regional_total_sales)

Region
East      6420
North    10010
South     9460
West      1110
Name: Total_Sales, dtype: int64


In [19]:
result = df.groupby('Region')[['Sales', 'Units']].sum()
print(result)

        Sales  Units
Region              
East     1170     26
North    1060     27
South    1260     30
West      410      8


In [20]:
agg_result = df.groupby('Region').agg({
    'Sales': ['sum', 'mean', 'max'],
    'Units': ['sum', 'min']
})
print(agg_result)

       Sales                  Units    
         sum        mean  max   sum min
Region                                 
East    1170  234.000000  340    26   4
North   1060  353.333333  410    27   6
South   1260  315.000000  330    30   6
West     410  136.666667  150     8   2


In [21]:
grouped = df.groupby(['Region', 'Salesperson'])['Sales'].sum()
print(grouped)

Region  Salesperson
East    Alice          1170
North   David           810
        Eve             250
South   Charlie         950
        Eve             310
West    Bob             410
Name: Sales, dtype: int64


# Basic Data Insights Using the Netflix Dataset
- How many titles are there per content type (Movie vs TV Show)?
- What is the number of titles released each year?
- Which countries produce the most content on Netflix?
- What is the average number of titles per rating?
- How many titles were added to Netflix each month or year (based on date_added)?
- Which directors have the most titles on Netflix?
- What are the top 5 most common genres or categories (from the listed_in column)?

In [22]:
df = pd.read_csv('netflix_titles.csv')
# df.columns.to_list()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [23]:
df.shape

(8807, 12)

In [24]:
# How many titles are there per content type (Movie vs TV Show)?
group_content = df.groupby('type')['title'].count()
group_content

type
Movie      6131
TV Show    2676
Name: title, dtype: int64

In [25]:
# What is the number of titles released each year?
group_year = df.groupby('release_year')['title'].count()
group_year.head(10)

release_year
1925    1
1942    2
1943    3
1944    3
1945    4
1946    2
1947    1
1954    2
1955    3
1956    2
Name: title, dtype: int64

In [26]:
# Which countries produce the most content on Netflix?
group_contries = df.groupby('country')['title'].count()
group_contries

country
, France, Algeria                                       1
, South Korea                                           1
Argentina                                              56
Argentina, Brazil, France, Poland, Germany, Denmark     1
Argentina, Chile                                        2
                                                       ..
Venezuela                                               1
Venezuela, Colombia                                     1
Vietnam                                                 7
West Germany                                            1
Zimbabwe                                                1
Name: title, Length: 748, dtype: int64

In [27]:
# What is the average number of titles per rating?
group_rating = df.groupby('rating')['title'].count()
group_rating

rating
66 min         1
74 min         1
84 min         1
G             41
NC-17          3
NR            80
PG           287
PG-13        490
R            799
TV-14       2160
TV-G         220
TV-MA       3207
TV-PG        863
TV-Y         307
TV-Y7        334
TV-Y7-FV       6
UR             3
Name: title, dtype: int64

In [28]:
# Which directors have the most titles on Netflix?
group_director = df.groupby('director')['title'].count()
group_director = group_director.reset_index()
group_director.columns = ['director', 'title_count']
group_director = group_director.sort_values(by='title_count', ascending=False)
group_director.head(10)

Unnamed: 0,director,title_count
3392,Rajiv Chilaka,19
3443,"Raúl Campos, Jan Suter",18
4046,Suhas Kadav,16
2598,Marcus Raboy,16
1790,Jay Karas,14
685,Cathy Garcia-Molina,13
4480,Youssef Chahine,12
2671,Martin Scorsese,12
1787,Jay Chapman,12
4020,Steven Spielberg,11


In [29]:
# What are the top 5 most common genres or categories (from the listed_in column)?
group_listed = df['listed_in'].str.split(', ').explode().value_counts()
group_listed = group_listed.reset_index()
group_listed.columns = ['genre', 'count']
group_listed = group_listed.sort_values(by='count', ascending=False)
group_listed.head(5)

Unnamed: 0,genre,count
0,International Movies,2752
1,Dramas,2427
2,Comedies,1674
3,International TV Shows,1351
4,Documentaries,869


# Pivot Table Analysis Using Netflix Dataset
- Create a pivot table showing the count of Movies and TV Shows per country.
- Create a pivot table showing the number of titles per release year and type.
- Create a pivot table for rating distribution by content type.
- Create a pivot table comparing the number of titles added per month and content type.
- Create a pivot table showing how many titles each country has produced across different ratings.
- Create a pivot table to compare genres (listed_in) across Movies vs TV Shows.
- Show how content type and release year interact using a pivot table.


In [30]:
df = pd.read_csv('netflix_titles.csv')    # link df to the csv file(pd.read_csv)
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [31]:
# Create a pivot table showing the count of Movies and TV Shows per country.
pivot_table = df.pivot_table(index='country', columns='type', values='title', aggfunc='count', fill_value=0)
print(pivot_table.head())

type                                                Movie  TV Show
country                                                           
, France, Algeria                                       1        0
, South Korea                                           0        1
Argentina                                              38       18
Argentina, Brazil, France, Poland, Germany, Den...      1        0
Argentina, Chile                                        2        0


In [32]:
# Create a pivot table showing the number of titles per release year and type.
pivot_table = df.pivot_table(index='release_year', columns='type', values='title', aggfunc='count', fill_value=0)
print(pivot_table.head())

type          Movie  TV Show
release_year                
1925              0        1
1942              2        0
1943              3        0
1944              3        0
1945              3        1


In [33]:
# Create a pivot table for rating distribution by content type.
pivot_table = df.pivot_table(index='rating', columns='type', values='title', aggfunc='count', fill_value=0)
print(pivot_table.head())

type    Movie  TV Show
rating                
66 min      1        0
74 min      1        0
84 min      1        0
G          41        0
NC-17       3        0
