# Understanding the PreProcessing

In [47]:
import pandas as pd

# Creating a messy dataset
data = {'Age': [25, None, 35, 29, None], 'Salary': [50000, 60000, None, 45000, 55000]}
df = pd.DataFrame(data)

print("Original Data:")
print(df)


Original Data:
    Age   Salary
0  25.0  50000.0
1   NaN  60000.0
2  35.0      NaN
3  29.0  45000.0
4   NaN  55000.0


# Handeling Missing Value

In [48]:
import pandas as pd

# Messy dataset
data = {'Age': [25, None, 35, 29, None], 'Salary': [50000, 60000, None, 45000, 55000]}
df = pd.DataFrame(data)

print("Original Data:")
print(df)

# Method 1: Remove rows with missing values
cleaned_df = df.dropna()
print("\nData after dropping missing values:")
print(cleaned_df)

# Method 2: Fill missing values with a fixed number (e.g., 0)
filled_df = df.fillna(0)
print("\nData after filling missing values with 0:")
print(filled_df)

# Method 3: Fill missing values with the mean
mean_filled_df = df.fillna(df.mean())
print("\nData after filling missing values with the mean:")
print(mean_filled_df)


Original Data:
    Age   Salary
0  25.0  50000.0
1   NaN  60000.0
2  35.0      NaN
3  29.0  45000.0
4   NaN  55000.0

Data after dropping missing values:
    Age   Salary
0  25.0  50000.0
3  29.0  45000.0

Data after filling missing values with 0:
    Age   Salary
0  25.0  50000.0
1   0.0  60000.0
2  35.0      0.0
3  29.0  45000.0
4   0.0  55000.0

Data after filling missing values with the mean:
         Age   Salary
0  25.000000  50000.0
1  29.666667  60000.0
2  35.000000  52500.0
3  29.000000  45000.0
4  29.666667  55000.0


# Encoding Categorial Data

In [49]:
import pandas as pd

# Creating a small dataset with categorical data
data = {'Color': ['Red', 'Blue', 'Green', 'Blue', 'Red']}
df = pd.DataFrame(data)

print("Original Data:")
print(df)

# Label Encoding
df['Color_Label'] = df['Color'].astype('category').cat.codes
print("\nData after Label Encoding:")
print(df)

# One-Hot Encoding
one_hot_encoded = pd.get_dummies(df['Color'], prefix='Color')
df = pd.concat([df, one_hot_encoded], axis=1)
print("\nData after One-Hot Encoding:")
print(df)


Original Data:
   Color
0    Red
1   Blue
2  Green
3   Blue
4    Red

Data after Label Encoding:
   Color  Color_Label
0    Red            2
1   Blue            0
2  Green            1
3   Blue            0
4    Red            2

Data after One-Hot Encoding:
   Color  Color_Label  Color_Blue  Color_Green  Color_Red
0    Red            2       False        False       True
1   Blue            0        True        False      False
2  Green            1       False         True      False
3   Blue            0        True        False      False
4    Red            2       False        False       True


In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Sample dataset
data = {'Height': [150, 160, 170, 180, 190], 'Weight': [50, 60, 70, 80, 90]}
df = pd.DataFrame(data)

print("Original Data:")
print(df)

# Min-Max Scaling
minmax = MinMaxScaler()
df_minmax = pd.DataFrame(minmax.fit_transform(df), columns=['Height_scaled', 'Weight_scaled'])

print("\nAfter Min-Max Scaling:")
print(df_minmax)

# Standard Scaling
standard = StandardScaler()
df_standard = pd.DataFrame(standard.fit_transform(df), columns=['Height_scaled', 'Weight_scaled'])

print("\nAfter Standard Scaling:")
print(df_standard)


Original Data:
   Height  Weight
0     150      50
1     160      60
2     170      70
3     180      80
4     190      90

After Min-Max Scaling:
   Height_scaled  Weight_scaled
0           0.00           0.00
1           0.25           0.25
2           0.50           0.50
3           0.75           0.75
4           1.00           1.00

After Standard Scaling:
   Height_scaled  Weight_scaled
0      -1.414214      -1.414214
1      -0.707107      -0.707107
2       0.000000       0.000000
3       0.707107       0.707107
4       1.414214       1.414214


In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Sample data
data = {'Hours_Studied': [1, 2, 3, 4, 5, 6, 7, 8],
        'Test_Score':    [35, 45, 50, 55, 65, 70, 75, 85]}
df = pd.DataFrame(data)

X = df[['Hours_Studied']]
y = df['Test_Score']

# Create a model
model = LinearRegression()

# Perform 5-fold cross-validation
scores = cross_val_score(model, X, y, cv=5)

print("Cross-validation scores for each fold:")
print(scores)

print("\nAverage Score (Final Accuracy):")
print(scores.mean())


Cross-validation scores for each fold:
[0.87800454 0.72023799 0.72023799        nan        nan]

Average Score (Final Accuracy):
nan




# Data preprocessing:

In [42]:
import pandas as pd

df = pd.read_csv(r"F:\Python\Machine_Learning\Mobiles Dataset (2025).csv", encoding="ISO-8859-1")
df.dropna(inplace=True)  


In [43]:
# check for the missing Values
print(df.isnull().sum())  


Company Name                 0
Model Name                   0
Mobile Weight                0
RAM                          0
Front Camera                 0
Back Camera                  0
Processor                    0
Battery Capacity             0
Screen Size                  0
Launched Price (Pakistan)    0
Launched Price (India)       0
Launched Price (China)       0
Launched Price (USA)         0
Launched Price (Dubai)       0
Launched Year                0
dtype: int64


In [44]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Apply label encoding on the 'Company Name' column
df["Company Name"] = label_encoder.fit_transform(df["Company Name"])

print(df.head())  # Display the transformed data


   Company Name            Model Name Mobile Weight  RAM Front Camera  \
0             0       iPhone 16 128GB          174g  6GB         12MP   
1             0       iPhone 16 256GB          174g  6GB         12MP   
2             0       iPhone 16 512GB          174g  6GB         12MP   
3             0  iPhone 16 Plus 128GB          203g  6GB         12MP   
4             0  iPhone 16 Plus 256GB          203g  6GB         12MP   

  Back Camera   Processor Battery Capacity Screen Size  \
0        48MP  A17 Bionic         3,600mAh  6.1 inches   
1        48MP  A17 Bionic         3,600mAh  6.1 inches   
2        48MP  A17 Bionic         3,600mAh  6.1 inches   
3        48MP  A17 Bionic         4,200mAh  6.7 inches   
4        48MP  A17 Bionic         4,200mAh  6.7 inches   

  Launched Price (Pakistan) Launched Price (India) Launched Price (China)  \
0               PKR 224,999             INR 79,999              CNY 5,799   
1               PKR 234,999             INR 84,999        

In [45]:
# One-Hot Encoding the 'Model Name' column
df = pd.get_dummies(df, columns=["Model Name"], drop_first=True)

print(df.head())  # Display the transformed data


   Company Name Mobile Weight  RAM Front Camera Back Camera   Processor  \
0             0          174g  6GB         12MP        48MP  A17 Bionic   
1             0          174g  6GB         12MP        48MP  A17 Bionic   
2             0          174g  6GB         12MP        48MP  A17 Bionic   
3             0          203g  6GB         12MP        48MP  A17 Bionic   
4             0          203g  6GB         12MP        48MP  A17 Bionic   

  Battery Capacity Screen Size Launched Price (Pakistan)  \
0         3,600mAh  6.1 inches               PKR 224,999   
1         3,600mAh  6.1 inches               PKR 234,999   
2         3,600mAh  6.1 inches               PKR 244,999   
3         4,200mAh  6.7 inches               PKR 249,999   
4         4,200mAh  6.7 inches               PKR 259,999   

  Launched Price (India)  ... Model Name_iPhone XR 128GB  \
0             INR 79,999  ...                      False   
1             INR 84,999  ...                      False   
2       

📝 Why Do We Split Data?

Splitting data into training, validation, and test sets helps evaluate model performance and prevent overfitting.

Training Set: Used to train the model.

Validation Set: Used for hyperparameter tuning and model selection.

Test Set: Used to evaluate the final model's performance.

In [46]:
from sklearn.model_selection import train_test_split

# Features (X) and target (y) - assuming "RAM" as target for demonstration
X = df.drop("RAM", axis=1)  # Drop the target column from features
y = df["RAM"]               # Target variable

# Train-test split (80-20 ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Training set size: (744, 920)
Test set size: (186, 920)
