<a href="https://colab.research.google.com/github/urmilaahire26/CothonSolutionTasks/blob/main/DSTask4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

# Build a synthetic customer-churn dataset, derive a few model features from
# it, write the result to CSV and read it back to confirm the round trip.

# Seed NumPy's global RNG so every run produces the same dataset.
# NOTE: the generated values depend on the ORDER of the np.random calls
# below; reordering them changes the data even with the same seed.
np.random.seed(42)

# Number of synthetic users (rows) to generate.
n_samples = 1000

# Where the CSV is written. The original hard-coded '/content/', which only
# exists on Google Colab and makes to_csv fail everywhere else. Use that
# directory when it is present and fall back to the current working
# directory otherwise, so the script runs in any environment.
colab_dir = Path('/content')
output_dir = colab_dir if colab_dir.is_dir() else Path.cwd()
csv_path = output_dir / 'synthetic_churn_data.csv'

# Pool of candidate subscription start dates (one per day, both ends included).
candidate_start_dates = pd.date_range('2019-01-01', '2023-01-01', freq='D')

# Raw columns. randint's upper bound is exclusive, so monthly_usage is in
# [10, 499] and total_sessions is in [1, 49].
data = {
    'user_id': np.arange(1, n_samples + 1),
    'start_date': pd.to_datetime(np.random.choice(candidate_start_dates, size=n_samples)),
    'subscription_status': np.random.choice(['active', 'inactive'], size=n_samples, p=[0.7, 0.3]),
    'monthly_usage': np.random.randint(10, 500, size=n_samples),  # Monthly usage in minutes
    'total_sessions': np.random.randint(1, 50, size=n_samples),  # Number of sessions
    'churn': np.random.choice([0, 1], size=n_samples, p=[0.7, 0.3]),  # 0: not churned, 1: churned
}

df = pd.DataFrame(data)

# Simulate an end date by adding a random duration of 1..364 days to each
# start date. 'start_date' is already datetime64, so no re-conversion is
# needed before the arithmetic.
duration_days = np.random.randint(1, 365, size=n_samples)
df['end_date'] = df['start_date'] + pd.to_timedelta(duration_days, unit='D')

# Check the first few rows to ensure 'start_date' and 'end_date' look right.
print(df[['start_date', 'end_date']].head())

# Feature engineering: subscription length in whole days.
df['subscription_length'] = (df['end_date'] - df['start_date']).dt.days

# Binary flag derived from 'subscription_status' (1 = active, 0 = anything
# else); a vectorized comparison replaces the row-by-row apply.
df['is_active'] = (df['subscription_status'] == 'active').astype(int)

# Usage per session. The '+ 1' in the denominator is kept from the original
# so the generated dataset stays identical; total_sessions is always >= 1
# here, so it is not needed to avoid division by zero.
df['average_usage'] = df['monthly_usage'] / (df['total_sessions'] + 1)

# Drop identifier and raw date columns; only the columns used for
# prediction are kept.
df = df.drop(columns=['user_id', 'start_date', 'end_date'])

# Persist the engineered dataset without the index column.
df.to_csv(csv_path, index=False)

# Display the first few rows of the DataFrame that was saved.
print(df.head())

# Reload the dataset that was just saved, replacing the in-memory frame.
df = pd.read_csv(csv_path)

# Check the reloaded data.
print(df.head())


  start_date   end_date
0 2022-01-31 2022-03-16
1 2022-12-30 2023-12-25
2 2021-05-10 2022-03-23
3 2022-07-18 2022-12-10
4 2022-02-04 2022-12-28
  subscription_status  monthly_usage  total_sessions  churn  \
0            inactive            247               9      0   
1            inactive            390              19      1   
2              active            445              47      0   
3            inactive            396              14      1   
4              active            492              38      0   

   subscription_length  is_active  average_usage  
0                   44          0      24.700000  
1                  360          0      19.500000  
2                  317          1       9.270833  
3                  145          0      26.400000  
4                  327          1      12.615385  
  subscription_status  monthly_usage  total_sessions  churn  \
0            inactive            247               9      0   
1            inactive            390         