# Importing Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mlt
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn .metrics import accuracy_score,mean_squared_error,r2_score


#  Data Set ni VS kodga o'rnatish

In [5]:
df=pd.read_csv('Indian_Kids_Screen_Time.csv.txt')

# Introducing with data

In [None]:
df.info()  #data 6 column da inputlar va bitta columnda output ya"ni target feature predict qilishimiz kerak bolgan qiymat

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9712 entries, 0 to 9711
Data columns (total 8 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                9712 non-null   int64  
 1   Gender                             9712 non-null   object 
 2   Avg_Daily_Screen_Time_hr           9712 non-null   float64
 3   Primary_Device                     9712 non-null   object 
 4   Exceeded_Recommended_Limit         9712 non-null   bool   
 5   Educational_to_Recreational_Ratio  9712 non-null   float64
 6   Health_Impacts                     6494 non-null   object 
 7   Urban_or_Rural                     9712 non-null   object 
dtypes: bool(1), float64(2), int64(1), object(4)
memory usage: 540.7+ KB


In [None]:
df.head() #Demak outmputimiz 2 ta bu binary classification sinfiga kiradi Supervised Machine Learningning

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban
1,11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban
2,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban
3,15,Female,1.21,Laptop,False,0.39,,Urban
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban


# Data preprocessing
#  Data Cleaning  or Missing Value

In [3]:
df.isnull().sum() #datasetimizda tushib qolgan qiymatlar faqat bitta ustunda berolgan ekan 2 ta usliu qollashimiz mumkin

Age                                     0
Gender                                  0
Avg_Daily_Screen_Time_hr                0
Primary_Device                          0
Exceeded_Recommended_Limit              0
Educational_to_Recreational_Ratio       0
Health_Impacts                       3218
Urban_or_Rural                          0
dtype: int64

# 1 usul  da  yani ustun bilan birga tashhlab yuborish mumkin lekin bunda juda kop datalar tushib qoladi va data occuracy juda yomon chiqadi 
df.dropna(axis=1,inplace=True)

# bu ustundagi datalarimiz hammasi string data type ya'ni datalarimiz hamasi object qiymatdaligi un mean emas mode bilad qollab ketamiz

In [3]:
df['Health_Impacts']

0                Poor Sleep, Eye Strain
1                            Poor Sleep
2                            Poor Sleep
3                                   NaN
4                   Poor Sleep, Anxiety
                     ...               
9707                         Poor Sleep
9708                         Poor Sleep
9709    Poor Sleep, Eye Strain, Anxiety
9710                         Poor Sleep
9711                            Anxiety
Name: Health_Impacts, Length: 9712, dtype: object

In [None]:
df['Health_Impacts'].fillna(df['Health_Impacts'].mode()[0],inplace=True) #ya'ni ustundagi eng ko'p qatnashgan qiymat bn hamma missing valuelarni filling qilib data setni toldirdik

In [None]:
df.isnull().sum() # mana korib turganingizdek tushib qolgan qiymat endi data setda mavjud emas keyingi encoding bosqichga o'tsak boladi

Age                                  0
Gender                               0
Avg_Daily_Screen_Time_hr             0
Primary_Device                       0
Exceeded_Recommended_Limit           0
Educational_to_Recreational_Ratio    0
Health_Impacts                       0
Urban_or_Rural                       0
dtype: int64

# Encoding

In [None]:
df.info() # Demak 3 ta object va 1 ta boolen data tpeli ustunlarimiz encoding qilishga muhtoj

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9712 entries, 0 to 9711
Data columns (total 8 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                9712 non-null   int64  
 1   Gender                             9712 non-null   object 
 2   Avg_Daily_Screen_Time_hr           9712 non-null   float64
 3   Primary_Device                     9712 non-null   object 
 4   Exceeded_Recommended_Limit         9712 non-null   bool   
 5   Educational_to_Recreational_Ratio  9712 non-null   float64
 6   Health_Impacts                     9712 non-null   object 
 7   Urban_or_Rural                     9712 non-null   object 
dtypes: bool(1), float64(2), int64(1), object(4)
memory usage: 540.7+ KB


In [7]:
df_encoded=df.copy()

In [10]:
from sklearn.preprocessing import LabelEncoder  
for col in df_encoded.select_dtypes(include=['object']).columns:
    cardinality=df_encoded[col].nunique()
    if cardinality<=5:
        le=LabelEncoder()
        df_encoded[col]=le.fit_transform(df_encoded[col])
    else:
        df_encoded=pd.get_dummies(df_encoded,columns=[col],dtype=int, drop_first=True)
df=df_encoded

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9712 entries, 0 to 9711
Data columns (total 21 columns):
 #   Column                                                        Non-Null Count  Dtype  
---  ------                                                        --------------  -----  
 0   Age                                                           9712 non-null   int64  
 1   Gender                                                        9712 non-null   int64  
 2   Avg_Daily_Screen_Time_hr                                      9712 non-null   float64
 3   Primary_Device                                                9712 non-null   int64  
 4   Exceeded_Recommended_Limit                                    9712 non-null   bool   
 5   Educational_to_Recreational_Ratio                             9712 non-null   float64
 6   Urban_or_Rural                                                9712 non-null   int64  
 7   Health_Impacts_Anxiety, Obesity Risk                          9712 no

In [21]:
df.head()

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Urban_or_Rural,"Health_Impacts_Anxiety, Obesity Risk",Health_Impacts_Eye Strain,"Health_Impacts_Eye Strain, Anxiety",...,"Health_Impacts_Eye Strain, Obesity Risk",Health_Impacts_Obesity Risk,Health_Impacts_Poor Sleep,"Health_Impacts_Poor Sleep, Anxiety","Health_Impacts_Poor Sleep, Anxiety, Obesity Risk","Health_Impacts_Poor Sleep, Eye Strain","Health_Impacts_Poor Sleep, Eye Strain, Anxiety","Health_Impacts_Poor Sleep, Eye Strain, Anxiety, Obesity Risk","Health_Impacts_Poor Sleep, Eye Strain, Obesity Risk","Health_Impacts_Poor Sleep, Obesity Risk"
0,0.322805,0.982444,-0.211179,-0.40888,True,-0.098694,0.646222,-0.08459,-0.266494,-0.118728,...,-0.105047,-0.163213,-0.551974,-0.258426,-0.08998,2.986692,-0.165197,-0.061841,-0.140498,-0.220935
1,-0.625879,-1.01787,0.149675,-1.542814,True,-1.737647,0.646222,-0.08459,-0.266494,-0.118728,...,-0.105047,-0.163213,1.811681,-0.258426,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935
2,1.587718,-1.01787,-0.362506,0.725055,True,-1.464488,0.646222,-0.08459,-0.266494,-0.118728,...,-0.105047,-0.163213,1.811681,-0.258426,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935
3,0.639033,-1.01787,-1.829204,-1.542814,False,-0.508432,0.646222,-0.08459,-0.266494,-0.118728,...,-0.105047,-0.163213,-0.551974,-0.258426,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935
4,-0.309651,-1.01787,0.894665,-0.40888,True,0.857362,0.646222,-0.08459,-0.266494,-0.118728,...,-0.105047,-0.163213,-0.551974,3.869585,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935


# Scaling

In [12]:
num_col=df.select_dtypes(include=['int64','float64']).columns

In [13]:
num_col

Index(['Age', 'Gender', 'Avg_Daily_Screen_Time_hr', 'Primary_Device',
       'Educational_to_Recreational_Ratio', 'Urban_or_Rural',
       'Health_Impacts_Anxiety, Obesity Risk', 'Health_Impacts_Eye Strain',
       'Health_Impacts_Eye Strain, Anxiety',
       'Health_Impacts_Eye Strain, Anxiety, Obesity Risk',
       'Health_Impacts_Eye Strain, Obesity Risk',
       'Health_Impacts_Obesity Risk', 'Health_Impacts_Poor Sleep',
       'Health_Impacts_Poor Sleep, Anxiety',
       'Health_Impacts_Poor Sleep, Anxiety, Obesity Risk',
       'Health_Impacts_Poor Sleep, Eye Strain',
       'Health_Impacts_Poor Sleep, Eye Strain, Anxiety',
       'Health_Impacts_Poor Sleep, Eye Strain, Anxiety, Obesity Risk',
       'Health_Impacts_Poor Sleep, Eye Strain, Obesity Risk',
       'Health_Impacts_Poor Sleep, Obesity Risk'],
      dtype='object')

In [14]:
scaler=StandardScaler()

In [15]:
df[num_col]=scaler.fit_transform(df[num_col])

In [16]:
df.head()

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Urban_or_Rural,"Health_Impacts_Anxiety, Obesity Risk",Health_Impacts_Eye Strain,"Health_Impacts_Eye Strain, Anxiety",...,"Health_Impacts_Eye Strain, Obesity Risk",Health_Impacts_Obesity Risk,Health_Impacts_Poor Sleep,"Health_Impacts_Poor Sleep, Anxiety","Health_Impacts_Poor Sleep, Anxiety, Obesity Risk","Health_Impacts_Poor Sleep, Eye Strain","Health_Impacts_Poor Sleep, Eye Strain, Anxiety","Health_Impacts_Poor Sleep, Eye Strain, Anxiety, Obesity Risk","Health_Impacts_Poor Sleep, Eye Strain, Obesity Risk","Health_Impacts_Poor Sleep, Obesity Risk"
0,0.322805,0.982444,-0.211179,-0.40888,True,-0.098694,0.646222,-0.08459,-0.266494,-0.118728,...,-0.105047,-0.163213,-0.551974,-0.258426,-0.08998,2.986692,-0.165197,-0.061841,-0.140498,-0.220935
1,-0.625879,-1.01787,0.149675,-1.542814,True,-1.737647,0.646222,-0.08459,-0.266494,-0.118728,...,-0.105047,-0.163213,1.811681,-0.258426,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935
2,1.587718,-1.01787,-0.362506,0.725055,True,-1.464488,0.646222,-0.08459,-0.266494,-0.118728,...,-0.105047,-0.163213,1.811681,-0.258426,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935
3,0.639033,-1.01787,-1.829204,-1.542814,False,-0.508432,0.646222,-0.08459,-0.266494,-0.118728,...,-0.105047,-0.163213,-0.551974,-0.258426,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935
4,-0.309651,-1.01787,0.894665,-0.40888,True,0.857362,0.646222,-0.08459,-0.266494,-0.118728,...,-0.105047,-0.163213,-0.551974,3.869585,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935


# Model Training

# input va outputlarni ajratib olamiz

In [22]:
x=df.drop(columns=['Urban_or_Rural'])

In [23]:
y=df['Urban_or_Rural']

In [24]:
x

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,"Health_Impacts_Anxiety, Obesity Risk",Health_Impacts_Eye Strain,"Health_Impacts_Eye Strain, Anxiety","Health_Impacts_Eye Strain, Anxiety, Obesity Risk","Health_Impacts_Eye Strain, Obesity Risk",Health_Impacts_Obesity Risk,Health_Impacts_Poor Sleep,"Health_Impacts_Poor Sleep, Anxiety","Health_Impacts_Poor Sleep, Anxiety, Obesity Risk","Health_Impacts_Poor Sleep, Eye Strain","Health_Impacts_Poor Sleep, Eye Strain, Anxiety","Health_Impacts_Poor Sleep, Eye Strain, Anxiety, Obesity Risk","Health_Impacts_Poor Sleep, Eye Strain, Obesity Risk","Health_Impacts_Poor Sleep, Obesity Risk"
0,0.322805,0.982444,-0.211179,-0.408880,True,-0.098694,-0.08459,-0.266494,-0.118728,-0.06014,-0.105047,-0.163213,-0.551974,-0.258426,-0.08998,2.986692,-0.165197,-0.061841,-0.140498,-0.220935
1,-0.625879,-1.017870,0.149675,-1.542814,True,-1.737647,-0.08459,-0.266494,-0.118728,-0.06014,-0.105047,-0.163213,1.811681,-0.258426,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935
2,1.587718,-1.017870,-0.362506,0.725055,True,-1.464488,-0.08459,-0.266494,-0.118728,-0.06014,-0.105047,-0.163213,1.811681,-0.258426,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935
3,0.639033,-1.017870,-1.829204,-1.542814,False,-0.508432,-0.08459,-0.266494,-0.118728,-0.06014,-0.105047,-0.163213,-0.551974,-0.258426,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935
4,-0.309651,-1.017870,0.894665,-0.408880,True,0.857362,-0.08459,-0.266494,-0.118728,-0.06014,-0.105047,-0.163213,-0.551974,3.869585,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9707,1.271490,0.982444,-0.636057,-0.408880,True,0.174465,-0.08459,-0.266494,-0.118728,-0.06014,-0.105047,-0.163213,1.811681,-0.258426,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935
9708,1.271490,-1.017870,0.044911,-0.408880,True,-0.371853,-0.08459,-0.266494,-0.118728,-0.06014,-0.105047,-0.163213,1.811681,-0.258426,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935
9709,0.955262,0.982444,0.737519,-0.408880,True,-0.508432,-0.08459,-0.266494,-0.118728,-0.06014,-0.105047,-0.163213,-0.551974,-0.258426,-0.08998,-0.334819,6.053380,-0.061841,-0.140498,-0.220935
9710,1.271490,0.982444,0.725878,0.725055,True,0.037886,-0.08459,-0.266494,-0.118728,-0.06014,-0.105047,-0.163213,1.811681,-0.258426,-0.08998,-0.334819,-0.165197,-0.061841,-0.140498,-0.220935


In [25]:
y

0       0.646222
1       0.646222
2       0.646222
3       0.646222
4       0.646222
          ...   
9707    0.646222
9708   -1.547455
9709   -1.547455
9710    0.646222
9711    0.646222
Name: Urban_or_Rural, Length: 9712, dtype: float64

# Train  Test Validating qismlarga ajratish

In [28]:
x_train,x_temp,y_train,y_temp=train_test_split(x,y,test_size=0.3,random_state=42)

In [29]:
x_test,x_val,y_test,y_val=train_test_split(x_temp,y_temp,test_size=0.5,random_state=42)

In [None]:
x_train.shape  # 6798 train 

(6798, 20)

In [31]:
x_val.shape # x va y validation lar ham birbiriga mos tushushi kk

(1457, 20)

In [32]:
y_train.shape 

(6798,)

In [33]:
x_test.shape

(1457, 20)

In [34]:
y_test.shape

(1457,)

In [35]:
y_val.shape

(1457,)

In [40]:
#model tanlash
model=DecisionTreeClassifier()

In [43]:
dt=model.fit(x_train,y_train)

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [44]:
y_pred=dt.predict(x_test)

NameError: name 'dt' is not defined

In [45]:
accuracy=accuracy_score(y_test,y_pred)

NameError: name 'y_pred' is not defined