In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
df = pd.read_csv("/content/netflix_users.csv")
df.head()

Unnamed: 0,User_ID,Name,Age,Country,Subscription_Type,Watch_Time_Hours,Favorite_Genre,Last_Login
0,1,James Martinez,18,France,Premium,80.26,Drama,2024-05-12
1,2,John Miller,23,USA,Premium,321.75,Sci-Fi,2025-02-05
2,3,Emma Davis,60,UK,Basic,35.89,Comedy,2025-01-24
3,4,Emma Miller,44,USA,Premium,261.56,Documentary,2024-03-25
4,5,Jane Smith,68,USA,Standard,909.3,Drama,2025-01-14


In [23]:
df.shape

(25000, 8)

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   User_ID            25000 non-null  int64  
 1   Name               25000 non-null  object 
 2   Age                25000 non-null  int64  
 3   Country            25000 non-null  object 
 4   Subscription_Type  25000 non-null  object 
 5   Watch_Time_Hours   25000 non-null  float64
 6   Favorite_Genre     25000 non-null  object 
 7   Last_Login         25000 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 1.5+ MB


In [25]:
df.describe()

Unnamed: 0,User_ID,Age,Watch_Time_Hours
count,25000.0,25000.0,25000.0
mean,12500.5,46.48288,500.468858
std,7217.022701,19.594861,286.381815
min,1.0,13.0,0.12
25%,6250.75,29.0,256.5675
50%,12500.5,46.0,501.505
75%,18750.25,63.0,745.7325
max,25000.0,80.0,999.99


In [26]:
df['Last_Login'] = pd.to_datetime(df['Last_Login'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   User_ID            25000 non-null  int64         
 1   Name               25000 non-null  object        
 2   Age                25000 non-null  int64         
 3   Country            25000 non-null  object        
 4   Subscription_Type  25000 non-null  object        
 5   Watch_Time_Hours   25000 non-null  float64       
 6   Favorite_Genre     25000 non-null  object        
 7   Last_Login         25000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 1.5+ MB


In [27]:
df.head()

Unnamed: 0,User_ID,Name,Age,Country,Subscription_Type,Watch_Time_Hours,Favorite_Genre,Last_Login
0,1,James Martinez,18,France,Premium,80.26,Drama,2024-05-12
1,2,John Miller,23,USA,Premium,321.75,Sci-Fi,2025-02-05
2,3,Emma Davis,60,UK,Basic,35.89,Comedy,2025-01-24
3,4,Emma Miller,44,USA,Premium,261.56,Documentary,2024-03-25
4,5,Jane Smith,68,USA,Standard,909.3,Drama,2025-01-14


In [28]:
def classify_engagement(hours):
  if hours < 300:
    return "Low"
  elif hours >= 300 and hours < 600:
    return "Medium"
  else:
    return "High"



In [29]:
df['engagement_level'] = df['Watch_Time_Hours'].apply(classify_engagement)

In [30]:
today = pd.to_datetime('today')
df['days_since_last_login'] = (today - df['Last_Login']).dt.days

In [31]:
df.head()

Unnamed: 0,User_ID,Name,Age,Country,Subscription_Type,Watch_Time_Hours,Favorite_Genre,Last_Login,engagement_level,days_since_last_login
0,1,James Martinez,18,France,Premium,80.26,Drama,2024-05-12,Low,635
1,2,John Miller,23,USA,Premium,321.75,Sci-Fi,2025-02-05,Medium,366
2,3,Emma Davis,60,UK,Basic,35.89,Comedy,2025-01-24,Low,378
3,4,Emma Miller,44,USA,Premium,261.56,Documentary,2024-03-25,Low,683
4,5,Jane Smith,68,USA,Standard,909.3,Drama,2025-01-14,High,388


In [52]:
# Since Minimum value of last login is 335 days, so 2 months logged in people would be at low risk, rest will be at high risk.
df['risk_level'] = df['days_since_last_login'].apply(lambda x: 'Low Risk' if x <= 425 else 'High Risk')

if recency is high OR engagement is low

In [53]:
df['likely_to_leave'] = df.apply(
    lambda row: 'Yes'
    if row['risk_level'] == 'High Risk' or row['engagement_level'] == 'Low'
    else 'No',
    axis=1)

In [54]:
df['likely_to_leave'].value_counts(normalize=True)*100

Unnamed: 0_level_0,proportion
likely_to_leave,Unnamed: 1_level_1
Yes,82.164
No,17.836


In [55]:
df.groupby('likely_to_leave')['Watch_Time_Hours'].mean()

Unnamed: 0_level_0,Watch_Time_Hours
likely_to_leave,Unnamed: 1_level_1
No,648.063189
Yes,468.42937


In [45]:
df.head()

Unnamed: 0,User_ID,Name,Age,Country,Subscription_Type,Watch_Time_Hours,Favorite_Genre,Last_Login,engagement_level,days_since_last_login,risk_level,likely_to_leave
0,1,James Martinez,18,France,Premium,80.26,Drama,2024-05-12,Low,635,High Risk,Yes
1,2,John Miller,23,USA,Premium,321.75,Sci-Fi,2025-02-05,Medium,366,Low Risk,No
2,3,Emma Davis,60,UK,Basic,35.89,Comedy,2025-01-24,Low,378,Low Risk,Yes
3,4,Emma Miller,44,USA,Premium,261.56,Documentary,2024-03-25,Low,683,High Risk,Yes
4,5,Jane Smith,68,USA,Standard,909.3,Drama,2025-01-14,High,388,Low Risk,No


In [57]:
def customer_type(row):
  if row['engagement_level'] == 'High' and row['days_since_last_login'] < 395:
    return 'Loyal'
  elif (row['engagement_level'] == 'Medium' or row['engagement_level'] == 'High') and (row['days_since_last_login'] > 395 and row['days_since_last_login'] <= 425):
    return 'Dormant'
  else:
    return 'Risky'

In [58]:
df['customer_segment'] = df.apply(customer_type, axis = 1)
df['customer_segment'].value_counts()

Unnamed: 0_level_0,count
customer_segment,Unnamed: 1_level_1
Risky,21859
Loyal,1649
Dormant,1492


In [59]:
df.groupby("customer_segment")["Watch_Time_Hours"].mean()


Unnamed: 0_level_0,Watch_Time_Hours
customer_segment,Unnamed: 1_level_1
Dormant,650.833619
Loyal,798.47715
Risky,467.724455


In [60]:
X = df[["Watch_Time_Hours", "days_since_last_login"]]


In [61]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [62]:
X.describe()


Unnamed: 0,Watch_Time_Hours,days_since_last_login
count,25000.0,25000.0
mean,500.468858,516.84388
std,286.381815,105.87668
min,0.12,335.0
25%,256.5675,424.0
50%,501.505,517.0
75%,745.7325,609.0
max,999.99,700.0


In [64]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)

In [66]:
df.groupby("cluster")[["Watch_Time_Hours","days_since_last_login"]].mean()


Unnamed: 0_level_0,Watch_Time_Hours,days_since_last_login
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,251.370599,517.456194
1,745.887908,516.24061


In [67]:
df.groupby("cluster")["customer_segment"].value_counts()


Unnamed: 0_level_0,Unnamed: 1_level_0,count
cluster,customer_segment,Unnamed: 2_level_1
0,Risky,11994
0,Dormant,413
1,Risky,9865
1,Loyal,1649
1,Dormant,1079


## Actions for cluster 0:
- 20% subscription discount

- personalized genre emails

- free 1-month premium

## Actions for cluster 1:
- NO discount to loyal

- upsell premium

- new content recommendations


In [68]:
y = df["likely_to_leave"].map({"Yes":1, "No":0})
X = df[["Watch_Time_Hours", "days_since_last_login"]]

In [69]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

pred = model.predict(X_test)

print(classification_report(y_test, pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       925
           1       1.00      1.00      1.00      4075

    accuracy                           1.00      5000
   macro avg       1.00      1.00      1.00      5000
weighted avg       1.00      1.00      1.00      5000

