# Loding Libraries

In [1]:
import pandas as pd
import numpy as np


# Uploading dataset

In [2]:
df=pd.read_csv('Transformed Data Set - Sheet1.csv')

# Introducing with Data

In [3]:
df=df.drop_duplicates()

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64 entries, 0 to 65
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Favorite Color        63 non-null     object
 1   Favorite Music Genre  62 non-null     object
 2   Favorite Beverage     63 non-null     object
 3   Favorite Soft Drink   63 non-null     object
 4   Gender                63 non-null     object
dtypes: object(5)
memory usage: 3.0+ KB


In [5]:
df.head()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,Cool,Rock,Vodka,7UP/Sprite,F
1,Neutral,Hip hop,Vodka,Coca Cola/Pepsi,F
2,Warm,Rock,Wine,Coca Cola/Pepsi,F
3,Warm,Folk/Traditional,Whiskey,Fanta,F
4,Cool,Rock,Vodka,Coca Cola/Pepsi,F


In [6]:
df.describe()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
count,63,62,63,63,63
unique,3,7,6,4,2
top,Cool,Rock,Doesn't drink,Coca Cola/Pepsi,F
freq,34,17,14,29,32


In [7]:
df.isnull().sum()

Favorite Color          1
Favorite Music Genre    2
Favorite Beverage       1
Favorite Soft Drink     1
Gender                  1
dtype: int64

# Handle missing value with for Loop

In [8]:
for col in df.columns:
    if df[col].dtype=='object':
        if df[col].isnull().any():
          df[col].fillna(df[col].mode()[0],inplace=True)
    else:
        df[col].fillna(df[col].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0],inplace=True)


In [9]:
df.isnull().sum()

Favorite Color          0
Favorite Music Genre    0
Favorite Beverage       0
Favorite Soft Drink     0
Gender                  0
dtype: int64

# Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

In [11]:
cardinality=df.nunique()  # onehot yoki Label qaysi biridan  foydalanamiz shuni aniqlab olish un cordinality kk

In [12]:
cardinality

Favorite Color          3
Favorite Music Genre    7
Favorite Beverage       6
Favorite Soft Drink     4
Gender                  2
dtype: int64

In [13]:
def raqamla(df):
    for col in df.columns:
        if df[col].dtype=='object':
            if df[col].nunique()<=5:
                dummies=pd.get_dummies(df[col],prefix=col, dtype=int)
                df=pd.concat([df.drop(columns=[col]),dummies],axis=1)
            else:
                df[col]=encoder.fit_transform(df[col])
    return df           

In [14]:
df=raqamla(df)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64 entries, 0 to 65
Data columns (total 11 columns):
 #   Column                               Non-Null Count  Dtype
---  ------                               --------------  -----
 0   Favorite Music Genre                 64 non-null     int64
 1   Favorite Beverage                    64 non-null     int64
 2   Favorite Color_Cool                  64 non-null     int64
 3   Favorite Color_Neutral               64 non-null     int64
 4   Favorite Color_Warm                  64 non-null     int64
 5   Favorite Soft Drink_7UP/Sprite       64 non-null     int64
 6   Favorite Soft Drink_Coca Cola/Pepsi  64 non-null     int64
 7   Favorite Soft Drink_Fanta            64 non-null     int64
 8   Favorite Soft Drink_Other            64 non-null     int64
 9   Gender_F                             64 non-null     int64
 10  Gender_M                             64 non-null     int64
dtypes: int64(11)
memory usage: 6.0 KB


# Scaling

In [19]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

In [20]:
num_col=df.select_dtypes(include=['int64','float64']).columns.drop('Gender_F')

In [21]:
num_col

Index(['Favorite Music Genre', 'Favorite Beverage', 'Favorite Color_Cool',
       'Favorite Color_Neutral', 'Favorite Color_Warm',
       'Favorite Soft Drink_7UP/Sprite', 'Favorite Soft Drink_Coca Cola/Pepsi',
       'Favorite Soft Drink_Fanta', 'Favorite Soft Drink_Other', 'Gender_M'],
      dtype='object')

In [22]:
def taqribla(df):
    df[num_col]=scaler.fit_transform(df[num_col])
    return df

In [23]:
taqribla(df)

Unnamed: 0,Favorite Music Genre,Favorite Beverage,Favorite Color_Cool,Favorite Color_Neutral,Favorite Color_Warm,Favorite Soft Drink_7UP/Sprite,Favorite Soft Drink_Coca Cola/Pepsi,Favorite Soft Drink_Fanta,Favorite Soft Drink_Other,Gender_F,Gender_M
0,1.000000,0.6,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0
1,0.333333,0.6,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,0.0
2,1.000000,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0.0
3,0.166667,0.8,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1,0.0
4,1.000000,0.6,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...
60,0.000000,0.8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1.0
62,0.333333,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0
63,0.333333,0.2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,1.0
64,1.000000,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1.0


# Model Training Process

In [26]:
# Inputlarni x va outputni y bn belgilab ajratib olamiz
x=df.drop(columns=['Gender_F']) 
y=df['Gender_F']

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
x.head

<bound method NDFrame.head of     Favorite Music Genre  Favorite Beverage  Favorite Color_Cool  \
0               1.000000                0.6                  1.0   
1               0.333333                0.6                  0.0   
2               1.000000                1.0                  0.0   
3               0.166667                0.8                  0.0   
4               1.000000                0.6                  1.0   
..                   ...                ...                  ...   
60              0.000000                0.8                  1.0   
62              0.333333                0.0                  1.0   
63              0.333333                0.2                  0.0   
64              1.000000                1.0                  1.0   
65              0.000000                0.0                  1.0   

    Favorite Color_Neutral  Favorite Color_Warm  \
0                      0.0                  0.0   
1                      1.0                  0.0   


In [30]:
# Datani qismlarga ajratish
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

# Model Selection

In [31]:
from sklearn.linear_model import LinearRegression

In [32]:
lr=LinearRegression()

In [33]:
lr.fit(x_train,y_train)

In [34]:
y_pred=lr.predict(x_test)

In [35]:
from sklearn.metrics import r2_score,mean_absolute_error

In [36]:
r2=r2_score(y_test,y_pred)

In [37]:
mae=mean_absolute_error(y_test,y_pred)

# K_fold_cross validation

In [42]:
from sklearn.model_selection import  KFold, cross_val_score

In [43]:
kf=KFold(n_splits=5,shuffle=True,random_state=42)

In [44]:
kf

KFold(n_splits=5, random_state=42, shuffle=True)

In [46]:
scores=cross_val_score(lr,x,y,cv=kf,scoring='r2')

In [47]:
print(scores)

[1. 1. 1. 1. 1.]


In [48]:
for i in scores:
    print(i)

1.0
1.0
1.0
1.0
1.0


In [50]:
import numpy as np
np.mean(scores)

np.float64(1.0)

In [51]:
np.std(scores)

np.float64(0.0)

In [52]:
from sklearn.metrics import make_scorer

In [53]:
mae=make_scorer(mean_absolute_error,greater_is_better=False)

In [54]:
scores=cross_val_score(lr,x,y,cv=kf,scoring=mae)

In [55]:
print(-scores)

[3.41607085e-16 4.01388324e-16 3.24526730e-16 5.46571335e-16
 6.47630098e-16]


In [56]:


print(-scores.mean())

4.523447143921471e-16
