In [1]:
#22i-1914
#QUESTION#1
#Talha Kayani
import pandas as pd
#loading data
property_data=pd.read_csv('property.csv')
#filtering data for just the specified city which is i am taking Islamabad
selected_city='Islamabad'
#creating new data frame as isl_data only having islamabad's data
isl_data=property_data[property_data['city']==selected_city]


In [2]:
#Data Pre-processing

#changing datatype of date-added to date-time.
isl_data['date_added']=pd.to_datetime(isl_data['date_added'])
#i get to know that there are missing values in agency and agent column so i filled them up
isl_data['agency'].fillna(isl_data['agency'].mode()[0], inplace=True)
isl_data['agent'].fillna(isl_data['agent'].mode()[0], inplace=True)

#removing outliers from the data to make a precise model
def remove_outliers(data, columns, threshold=1.5):
   
    data_no_outliers = data.copy()
    
    for column in columns:
       
        Q1 = data_no_outliers[column].quantile(0.25)
        Q3 = data_no_outliers[column].quantile(0.75)
        
        
        IQR = Q3 - Q1
        
        
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR
        
        # Remove outliers for the current column
        data_no_outliers = data_no_outliers[(data_no_outliers[column] >= lower_bound) & (data_no_outliers[column] <= upper_bound)]
    
    return data_no_outliers

# List of numeric columns in isl_data
numeric_columns = ['price', 'baths', 'latitude', 'longitude', 'bedrooms']

# Removing outliers from all numeric columns
isldata = remove_outliers(isl_data, numeric_columns)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isl_data['date_added']=pd.to_datetime(isl_data['date_added'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isl_data['agency'].fillna(isl_data['agency'].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isl_data['agent'].fillna(isl_data['agent'].mode()[0], inplace=True)


In [3]:
#Exploratory Data Analysis (EDA)

#Correlations
numeric_columns = isldata.select_dtypes(include='number')
correlation_matrix = numeric_columns.corr()
# Printing correlation values for specific pairs
#I have choosed 0.7 to only viw notably high or low relations
notable_correlations = correlation_matrix[(correlation_matrix > 0.7) | (correlation_matrix < -0.7)].stack().reset_index()
notable_correlations.columns = ['Variable 1', 'Variable 2', 'Correlation']
print(notable_correlations)


#checking correlation between the number of properties listed by an agent,agency and the average property price
properties_per_agent = isldata['agent'].value_counts()
properties_per_agency = isldata['agency'].value_counts()
average_price_per_agent = isldata.groupby('agent')['price'].mean()
average_price_per_agency = isldata.groupby('agency')['price'].mean()

# merging the calculated metrics into a single DataFrame
agent_metrics = pd.DataFrame({
    'properties_count': properties_per_agent,
    'average_price': average_price_per_agent
})

agency_metrics = pd.DataFrame({
    'properties_count': properties_per_agency,
    'average_price': average_price_per_agency
})

# Correlation for agents
agent_correlation = agent_metrics['properties_count'].corr(agent_metrics['average_price'])

# Correlation for agencies
agency_correlation = agency_metrics['properties_count'].corr(agency_metrics['average_price'])

print(f"Correlation between properties count and average price for agents: {agent_correlation}")
print(f"Correlation between properties count and average price for agencies: {agency_correlation}")


    Variable 1   Variable 2  Correlation
0  property_id  property_id     1.000000
1  location_id  location_id     1.000000
2        price        price     1.000000
3     latitude     latitude     1.000000
4    longitude    longitude     1.000000
5        baths        baths     1.000000
6        baths     bedrooms     0.788133
7     bedrooms        baths     0.788133
8     bedrooms     bedrooms     1.000000
Correlation between properties count and average price for agents: -0.023466138355901427
Correlation between properties count and average price for agencies: -0.03776101284324156


In [4]:
#Feature Engineering

#computing new column indicating price per square meter
kanal_to_sq_meter = 505.857
marla_to_sq_meter = 25.2929

# Function to convert 'area' to square meters
def convert_to_sq_meter(row):
    try:
        # Removing commas and then converting to float
        numeric_part = float(row['area'].replace(',', '').split()[0])
        unit_part = row['area'].split()[1].lower()

        if 'kanal' in unit_part:
            return numeric_part * kanal_to_sq_meter
        elif 'marla' in unit_part:
            return numeric_part * marla_to_sq_meter
        else:
            return None  # Handle other cases if needed
    except (ValueError, IndexError):
        return None

# Applying the conversion function to the 'area' column
isldata['area_sq_meter'] = isldata.apply(convert_to_sq_meter, axis=1)
# Adding a new column 'price_per_sq_meter'
isldata['price_per_sq_meter'] = isldata['price'] / isldata['area_sq_meter']

# Deriving additional temporal features
isldata['month_added'] = isldata['date_added'].dt.month
isldata['quarter_added'] = isldata['date_added'].dt.quarter
isldata['day_of_week_added'] = isldata['date_added'].dt.day_of_week  # Monday is 0, Sunday is 6


#using Robust Standardization method

from sklearn.preprocessing import RobustScaler
import numpy as np
# Extracting numerical columns for scaling
numerical_columns = ['price', 'area_sq_meter', 'price_per_sq_meter', 'baths', 'bedrooms', 'latitude', 'longitude']
# Replacing infinite values with a large finite value
isldata.replace([np.inf, -np.inf], np.finfo(np.float64).max, inplace=True)
# Initialize the RobustScaler
scaler = RobustScaler()
isldata[numerical_columns] = scaler.fit_transform(isldata[numerical_columns])


import pandas as pd

# Assuming 'isldata' is your DataFrame
categorical_columns = ['property_type', 'location', 'city', 'province_name', 'purpose', 'agency', 'agent']

# Apply one-hot encoding
isldata = pd.get_dummies(isldata, columns=categorical_columns)

# Droping unnecessary columns
columns_to_drop = ['property_id', 'location_id', 'page_url', 'area',  'date_added']
isldata = isldata.drop(columns=columns_to_drop)


In [5]:
#Model Training
from sklearn.model_selection import train_test_split


# Specifying the features (X) and target variable (y)
features = ['area_sq_meter', 'bedrooms']
target = 'price'

X = isldata[features]
y = isldata[target]

# Splitting the data into training and testing sets
#  0.2 for an 80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Defining the model
rf_model = RandomForestRegressor()

# Defining hyperparameters to tune and their possible values
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Performing Grid Search with 5-fold cross-validation(repeating process 5 times)
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_rf_model = grid_search.best_estimator_
# Training the model on the entire training set
best_rf_model.fit(X_train, y_train)





In [6]:
#Model Evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Making predictions on the test set using the trained model
predictions = best_rf_model.predict(X_test)

# Calculating Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error (MAE):", mae)

# Calculating Mean Squared Error (MSE)
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error (MSE):", mse)

# Calculating Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)






Mean Absolute Error (MAE): 0.5925589152079705
Mean Squared Error (MSE): 0.9004823368624048
Root Mean Squared Error (RMSE): 0.9489374778468836


In [7]:
#22i-1914
#QUESTION#2
#Talha Kayani
#loading data
fraud=pd.read_csv('fraud.csv')


# Using get_dummies to convert categorical variables into dummy variables
fraud= pd.get_dummies(fraud, columns=['Undergrad', 'Marital.Status', 'Urban'])


# Target Variable Transformation
#right is equal to set false because i have not included upper limit as it can go to any range in the data
fraud['Taxable.Income'] = pd.cut(fraud['Taxable.Income'], bins=[-float('inf'), 30000, float('inf')], labels=['Risky', 'Good'], right=False)


from sklearn.preprocessing import MinMaxScaler
#deciding on what features to scale
features_to_scale = ['Work.Experience', 'City.Population']
#using minmax scaler
scaler = MinMaxScaler()
# Fit the scaler on the selected features and transform the data
fraud[features_to_scale] = scaler.fit_transform(fraud[features_to_scale])


from sklearn.model_selection import train_test_split
# Features excluding the target variable
X = fraud.drop('Taxable.Income', axis=1)  
 # Target variable
y = fraud['Taxable.Income'] 
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
dt_classifier = DecisionTreeClassifier(random_state=42)
# Training the model on the training set
dt_classifier.fit(X_train, y_train)

# Making predictions on the test set
y_pred = dt_classifier.predict(X_test)

# Displaying the classification report
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)


Classification Report:
               precision    recall  f1-score   support

        Good       0.77      0.80      0.79        94
       Risky       0.17      0.15      0.16        26

    accuracy                           0.66       120
   macro avg       0.47      0.48      0.47       120
weighted avg       0.64      0.66      0.65       120



In [8]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_model = DecisionTreeClassifier()

grid_search = GridSearchCV(dt_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Getting the best hyperparameters
best_params = grid_search.best_params_
best_dt_model = grid_search.best_estimator_

# Evaluating the model on the test set
y_pred = best_dt_model.predict(X_test)


# best_dt_model is trained decision tree model
y_pred = best_dt_model.predict(X_test)

# Print classification report again after hyperparameter tuning
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

        Good       0.77      0.85      0.81        94
       Risky       0.12      0.08      0.10        26

    accuracy                           0.68       120
   macro avg       0.45      0.46      0.45       120
weighted avg       0.63      0.68      0.65       120

