# DATA CLEANING 

#### 0 step. Import necessary libraries

In [1]:
#-------------------------- 

import pandas as pd
import numpy as np

#### 1 step. Upload our dataset

In [2]:
#-------------------------- 

df = pd.read_csv('C:/Users/User/Desktop/Assignment 7-8/Data/Raw data/apartment_price_in_Kokshetau_with_rooms.csv')
df = df.drop(columns=['Unnamed: 0'], axis=1)

#### 2 step. Analyse the dataset

In [3]:
#-------------------------- 

print(df.info())
print()
print(df.describe().round(1))
print()
print(df.isnull().sum())
print()
print(df['type_of_house'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7736 entries, 0 to 7735
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0.1    7736 non-null   int64  
 1   price           7549 non-null   float64
 2   room            7582 non-null   float64
 3   area            7582 non-null   object 
 4   flat_toilets    3956 non-null   object 
 5   balcony         990 non-null    object 
 6   current_floors  7447 non-null   float64
 7   total_floors    7447 non-null   float64
 8   ceiling         3049 non-null   float64
 9   dorm            4544 non-null   object 
 10  mortgage        308 non-null    object 
 11  year            7582 non-null   float64
 12  type_of_house   6549 non-null   object 
 13  condition       3417 non-null   object 
 14  repair_status   2700 non-null   object 
 15  type_of_floor   2634 non-null   object 
dtypes: float64(6), int64(1), object(9)
memory usage: 967.1+ KB
None

       Unname

#### 3 step. Count of False in 'is_in_kokshetau' column

In [4]:
#-------------------------- 

count_false = (df['is_in_kokshetau'] == False).sum()

print('Sum of False in the "is_in_kokshetau column":', count_false)
print()
print(df.isna().sum())


KeyError: 'is_in_kokshetau'

#### 4 step. Drop duplicates, price where NaN, rows where column 'is_in_kokshetau' == False

In [None]:
#-------------------------- 

df.drop_duplicates(inplace=True)
df = df.dropna(subset=['price']).reset_index(drop=True)
df = df[df['is_in_kokshetau'] != False].reset_index(drop=True)

df.reset_index(drop=True, inplace=True)
df.head()

#### 5 step. Fill in the NaN values with the value "нет"

In [None]:
#-------------------------- 

df['mortgage'] = df['mortgage'].fillna('нет')
df['balcony'] = df['balcony'].fillna('нет')
df['dorm'] = df['dorm'].fillna('нет')
df.isnull().sum()

#### 6 step. Fill in the NaN values with the value of mode or median 

In [None]:
#-------------------------- 

df['ceiling'].fillna(df['ceiling'].median(), inplace=True)
df['repair_status'].fillna(df['repair_status'].mode()[0], inplace=True)
df['flat_toilets'].fillna(df['flat_toilets'].mode()[0], inplace=True)
df['type_of_house'].fillna(df['type_of_house'].mode()[0], inplace=True)

df.isnull().sum()

#### 7 step. Analyse the non-null count and dtype

In [None]:
#-------------------------- 

df.info()

#### 8 step. Drop unnecessary columns and rows

In [None]:
#-------------------------- 

df.drop(columns=['condition'], inplace=True)  
df.drop(columns=['type_of_floor'], inplace=True)  
df.dropna(subset=['current_floors', 'total_floors'], inplace=True)
df = df[df['year'] != 2025]

df.reset_index(drop=True, inplace=True)
df.isnull().sum()

#### 9 step. Check the unique value of 'year' and 'ceiling' columns

In [None]:
#-------------------------- 

print(df['year'].unique())
print()
print(df['ceiling'].unique())


#### 10 step. Replace the abnormal value with the median value

In [None]:
#-------------------------- 

print(df[df['ceiling'] == 100])

df['ceiling'] = df['ceiling'].replace(100, df['ceiling'].median())

print(df['ceiling'].unique())


#### 11 step. Analyse the non-null count and dtype

In [None]:
#-------------------------- 


df.to_csv('apartment_price.csv')
df.info()

### 12 step. Convert to float and int

In [None]:
#-------------------------- 

df['area'] = df['area'].str.split(',').str[0]

df['area'] = df['area'].astype(float)
df['price'] = df['price'].astype(int)

### 13 step. Sort by area and show top 20

In [None]:
#-------------------------- 

top_20_area = df.sort_values(by='area', ascending=False).head(20)
top_20_area

### 14 step. Drop the abnormal value

In [None]:
#-------------------------- 

df = df.drop(index=[853, 555])
df.reset_index(drop=True, inplace=True)

df['area'].max()

### 15 step. Ready Data

In [None]:
#-------------------------- 

df.to_csv('apartment_price_in_Kokshetau.csv')
df.head()

# VISUALIZATIONS

#### 0 step. Import necessary libraries

In [None]:
#-------------------------- 

import seaborn as sns
import matplotlib.pyplot as plt

#### 1 step. Draw histogram with Distribution of Price

In [None]:
#-------------------------- 

plt.figure(figsize=(8,6))
plt.hist(df['price'].dropna(), bins=50, color='skyblue', edgecolor='black')
plt.title('Distribution of Price')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

#### 2 step. Draw a figure with price vs area

In [None]:
#-------------------------- 

plt.figure(figsize=(8,6))
sns.scatterplot(x='area', y='price', data=df, color='orange')


plt.title('Price vs Area')
plt.xlabel('Area (sq.m.)')
plt.ylabel('Price')


plt.xticks(rotation=45, ha='right')  
plt.tight_layout()  

plt.grid(True)
plt.show()

#### 3 step. Boxplot with type of house and year

In [None]:
#-------------------------- 

plt.figure(figsize=(10,6))
sns.boxplot(x='type_of_house', y='year', data=df)
plt.title('Year of Construction by Type of House')
plt.xlabel('Type of House')
plt.ylabel('Year')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

### 4 step. Correlation matrix

In [None]:
numeric_columns = df.select_dtypes(include=[float, int]).columns
correlation_matrix = df[numeric_columns].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, linewidths=0.5)
plt.title("Correlation matrix")
plt.show()