In [1]:
!pip install pandas scikit-learn



In [2]:
import pandas as pd

In [3]:
# Read the social-media users table from a CSV in the working directory
df = pd.read_csv(filepath_or_buffer="SocialMediaUsers.csv")

In [4]:
# Peek at the top of the table (five rows, the head() default)
print(df.head(n=5))

    Platform      Owner                 Primary Usage           Country  \
0   WhatsApp       Meta                     Messaging       Switzerland   
1     WeChat    Tencent    Messaging and social media        Madagascar   
2   Snapchat  Snap Inc.          Multimedia messaging  Pitcairn Islands   
3  Instagram       Meta       Photo and video sharing       Timor-Leste   
4    Threads       Meta  Text-based social networking           Bermuda   

   Daily Time Spent (min) Verified Account Date Joined  
0                  113.94              Yes  2019-03-03  
1                   49.63              Yes  2023-09-21  
2                   29.01              Yes  2020-12-13  
3                  295.43              Yes  2019-04-21  
4                   71.78               No  2015-07-14  


In [5]:
# Report the table dimensions as (row count, column count)
print((len(df.index), len(df.columns)))

(10000, 7)


In [6]:
# Descriptive statistics; by default describe() covers only numeric columns
numeric_summary = df.describe()
print(numeric_summary)

       Daily Time Spent (min)
count            10000.000000
mean               152.211145
std                 85.142750
min                  5.020000
25%                 78.920000
50%                152.735000
75%                225.642500
max                300.000000


In [7]:
# List the dtype pandas inferred for each column
column_types = df.dtypes
print(column_types)

Platform                   object
Owner                      object
Primary Usage              object
Country                    object
Daily Time Spent (min)    float64
Verified Account           object
Date Joined                object
dtype: object


In [8]:
# Count missing entries per column (isna is the same check as isnull)
print(df.isna().sum())

Platform                  0
Owner                     0
Primary Usage             0
Country                   0
Daily Time Spent (min)    0
Verified Account          0
Date Joined               0
dtype: int64


In [11]:
# Discard every row that has at least one missing value
df = df.dropna(axis=0, how='any')

In [14]:
# Binary encoding for Verified Account ('Yes' -> 1, 'No' -> 0).
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: with
# only the 'Yes'/'No' keys, a second execution would find no matching key for
# the already-encoded integers and overwrite the whole column with NaN.
df['Verified Account'] = df['Verified Account'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})
# Series.map yields NaN for values missing from the dict; rows with missing
# values were dropped earlier, so any NaN here means an unexpected label.
assert df['Verified Account'].notna().all(), "Unexpected value in 'Verified Account'"

In [15]:
# Parse the join date once, store the parsed column back, and derive the
# year and month of joining from it
date_joined = pd.to_datetime(df['Date Joined'])
df['Date Joined'] = date_joined
df['Year Joined'] = date_joined.dt.year
df['Month Joined'] = date_joined.dt.month

In [16]:
# The raw date column is no longer needed once year/month have been extracted
df = df.drop(columns=['Date Joined'])

In [17]:
# Expand each nominal text column into one indicator column per category
df = pd.get_dummies(
    data=df,
    columns=['Platform', 'Owner', 'Primary Usage', 'Country'],
)

In [19]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous features with a default StandardScaler,
# fitted on and applied to the same columns, writing the result back into df
numeric_cols = ['Daily Time Spent (min)', 'Year Joined', 'Month Joined']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [20]:
# Inspect the preprocessed frame: first rows, then its dimensions on a new line
print(df.head(), df.shape, sep='\n')

   Daily Time Spent (min)  Verified Account  Year Joined  Month Joined  \
0               -0.449516                 1    -0.281625     -1.027430   
1               -1.204874                 1     1.092489      0.716390   
2               -1.447067                 1     0.061904      1.588300   
3                1.682187                 1    -0.281625     -0.736793   
4               -0.944710                 0    -1.655739      0.135117   

   Platform_Facebook  Platform_Instagram  Platform_LinkedIn  \
0              False               False              False   
1              False               False              False   
2              False               False              False   
3              False                True              False   
4              False               False              False   

   Platform_Pinterest  Platform_Quora  Platform_Reddit  ...  Country_Uruguay  \
0               False           False            False  ...            False   
1               