In [8]:
from sklearn.model_selection import train_test_split,KFold,LeaveOneOut
from sklearn.metrics import accuracy_score,confusion_matrix,r2_score,root_mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder,StandardScaler
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import random


In [9]:
df = pd.read_csv("house_prices_dataset.csv")
display(df)

Unnamed: 0,SquareFootage,Bedrooms,Bathrooms,HouseAge,Neighborhood,Price
0,3974,4,1.9,36,Suburb,500100
1,1660,5,1.1,23,Midtown,285600
2,2094,1,2.8,92,Uptown,283800
3,1930,5,2.5,45,Uptown,322000
4,1895,5,1.2,52,Chinatown,301900
...,...,...,...,...,...,...
95,1876,4,1.5,15,Uptown,290600
96,1591,2,3.8,12,Chinatown,292500
97,3064,3,3.9,29,Uptown,441200
98,1563,1,3.7,18,Chinatown,271400


In [10]:
print(df.isnull().sum())
print(df.info())

SquareFootage    0
Bedrooms         0
Bathrooms        0
HouseAge         0
Neighborhood     0
Price            0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SquareFootage  100 non-null    int64  
 1   Bedrooms       100 non-null    int64  
 2   Bathrooms      100 non-null    float64
 3   HouseAge       100 non-null    int64  
 4   Neighborhood   100 non-null    object 
 5   Price          100 non-null    int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 4.8+ KB
None


In [11]:
# le = LabelEncoder()
# df_encoded = df
# df_encoded["Neighborhood"]= le.fit_transform(df["Neighborhood"])
df.drop(columns=["HouseAge"],inplace=True)
df_encoded = pd.get_dummies(df,columns=["Neighborhood"],drop_first=True)
display(df_encoded)

Unnamed: 0,SquareFootage,Bedrooms,Bathrooms,Price,Neighborhood_Downtown,Neighborhood_Midtown,Neighborhood_Suburb,Neighborhood_Uptown
0,3974,4,1.9,500100,False,False,True,False
1,1660,5,1.1,285600,False,True,False,False
2,2094,1,2.8,283800,False,False,False,True
3,1930,5,2.5,322000,False,False,False,True
4,1895,5,1.2,301900,False,False,False,False
...,...,...,...,...,...,...,...,...
95,1876,4,1.5,290600,False,False,False,True
96,1591,2,3.8,292500,False,False,False,False
97,3064,3,3.9,441200,False,False,False,True
98,1563,1,3.7,271400,False,False,False,False


In [12]:
lr = LinearRegression()
y = df_encoded.pop("Price")
X_train,X_test,Y_train,Y_test= train_test_split(df_encoded,y,test_size=0.2,random_state=42)
scalar=StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test= scalar.transform(X_test)
lr.fit(X_train,Y_train)
y_pred = lr.predict(X_test)

print(f"RMSE: {root_mean_squared_error(y_pred,Y_test):.2f}")
print(f"R2 Score: {r2_score(y_pred,Y_test):.2f}")

RMSE: 10640.84
R2 Score: 0.99


In [13]:
test_sample = {
    "SquareFootage": 2500,
    "Bedrooms": 4,
    "Bathrooms": 2.5,
    "HouseAge": 10,
    "Neighborhood": "Uptown"
}

# Convert to DataFrame
test = pd.DataFrame([test_sample])
test = pd.get_dummies(test, columns=["Neighborhood"], drop_first=True)
test.drop(columns=["HouseAge"],inplace=True)
# Align test columns with training columns (fill missing columns with 0)
for col in df_encoded.columns:
    if col not in test.columns and col != "Price":
        test[col] = 0

# print(test)

test_p = lr.predict(test)
print(f"{test_p[0]:.2f}")

215345280.97




In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier, export_text

num_customers = 500

data = {
    'customer_id': range(1, num_customers+1),
    'age': np.random.randint(18, 70, size=num_customers),
    'total_spending': np.concatenate([
        np.random.normal(1200, 200, int(num_customers*0.3)),
        np.random.normal(400, 150, int(num_customers*0.7))
    ]),
    'num_visits': np.concatenate([
        np.random.poisson(12, int(num_customers*0.3)),
        np.random.poisson(4, int(num_customers*0.7))
    ]),
    'purchase_freq': np.concatenate([
        np.random.normal(15, 3, int(num_customers*0.3)),
        np.random.normal(45, 10, int(num_customers*0.7))
    ])
}

df = pd.DataFrame(data)

# Create target variable (high-value = 1, low-value = 0)
df['high_value'] = ((df['total_spending'] > 800) & (df['num_visits'] > 8) & (df['purchase_freq'] < 30)).astype(int)

for col in ['total_spending', 'num_visits', 'purchase_freq']:
    df[col].fillna(df[col].mean(),inplace=True)
    
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['age', 'total_spending', 'num_visits', 'purchase_freq']])

df_scaled = pd.DataFrame(scaled_features, columns=['age_scaled', 'total_spending_scaled', 'num_visits_scaled', 'purchase_freq_scaled'])
df = pd.concat([df, df_scaled], axis=1)

X = df[['age_scaled', 'total_spending_scaled', 'num_visits_scaled', 'purchase_freq_scaled']]
y = df['high_value']

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train SVM classifier
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, Y_train)

# Evaluate on test set
y_pred_svm = svm_classifier.predict(X_test)

print("\nSVM Classification Report:")
print(classification_report(Y_test, y_pred_svm))
print("\nSVM Confusion Matrix:")
print(confusion_matrix(Y_test, y_pred_svm))

# Train Decision Tree classifier
tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X_train, Y_train)
display(X_train)
# Extract rules
tree_rules = export_text(tree_classifier, feature_names=['age', 'total_spending', 'num_visits', 'purchase_freq'])
print("\nDecision Tree Rules:")
print(tree_rules)

# Evaluate on test set
y_pred_tree = tree_classifier.predict(X_test)

print("\nDecision Tree Classification Report:")
print(classification_report(Y_test, y_pred_tree))
print("\nDecision Tree Confusion Matrix:")
print(confusion_matrix(Y_test, y_pred_tree))


SVM Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       112
           1       0.90      0.97      0.94        38

    accuracy                           0.97       150
   macro avg       0.95      0.97      0.96       150
weighted avg       0.97      0.97      0.97       150


SVM Confusion Matrix:
[[108   4]
 [  1  37]]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(),inplace=True)


Unnamed: 0,age_scaled,total_spending_scaled,num_visits_scaled,purchase_freq_scaled
5,-0.078412,1.029773,1.272156,-1.499344
116,-0.486100,1.094320,0.124001,-1.445034
45,0.329276,1.691994,1.731417,-1.044355
16,1.280547,1.696145,2.190679,-1.183064
462,-0.825840,-1.086235,-0.564892,-0.080576
...,...,...,...,...
106,-0.893788,0.879569,1.961048,-1.129842
270,1.280547,-1.085554,-1.024154,0.224553
348,-0.214308,-0.755281,-1.024154,0.418616
435,0.329276,-0.488827,-0.564892,0.578701



Decision Tree Rules:
|--- num_visits <= 0.47
|   |--- class: 0
|--- num_visits >  0.47
|   |--- total_spending <= 0.50
|   |   |--- class: 0
|   |--- total_spending >  0.50
|   |   |--- class: 1


Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       112
           1       1.00      1.00      1.00        38

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150


Decision Tree Confusion Matrix:
[[112   0]
 [  0  38]]
