In [1]:
import os
import re
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import norm
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)

In [2]:
LOCAL_PATH = "C:/Users/DELL/PycharmProjects/Zoi Task/data/"
filename = "rent_data_clean_2.csv"
rent_data = pd.read_csv(os.path.join(LOCAL_PATH, filename))

In [3]:
rent_data.shape

(259646, 21)

In [4]:
rent_data.columns

Index(['garden', 'noRooms', 'lift', 'livingSpace', 'baseRent', 'cellar',
       'hasKitchen', 'newlyConst', 'balcony', 'serviceCharge', 'description',
       'floor', 'facilities', 'yearConstructedRange', 'condition',
       'typeOfFlat_loft', 'typeOfFlat_maisonette',
       'typeOfFlat_non_luxury_type', 'typeOfFlat_penthouse',
       'typeOfFlat_terraced_flat', 'total_rent_new'],
      dtype='object')

# Textual Data Analysis and Processing 

## Fill the descriptions and facilities that are null 

In [5]:
rent_data['description'] = rent_data['description'].fillna('keine beschreibung angegebe')
rent_data['facilities'] = rent_data['facilities'].fillna('keine beschreibung angegebe')

 ## Lets check the average length in character of each column 

In [6]:
rent_data['pre_clean_description_length'] = [len(t) for t in rent_data.description]
print(f'The average length of a description is: {int(rent_data["pre_clean_description_length"].mean())}')

The average length of a description is: 467


In [7]:
rent_data['pre_clean_facilities_length'] = [len(t) for t in rent_data.facilities]
print(f'The average length of a facilities is: {int(rent_data["pre_clean_facilities_length"].mean())}')

The average length of a facilities is: 271


## Text preprocessing steps

### Removing special characters/strings 

In [8]:
from nltk.tokenize import WordPunctTokenizer


tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
www_pat = r'www.[^ ]+'

def text_cleaner(text):
    stripped = re.sub(combined_pat, '', text)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    
    letters_only = re.sub("[^A-Za-z0-9]", " ", lower_case)
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = [x for x  in tok.tokenize(letters_only) if len(x) > 1]
    return (" ".join(words)).strip()

In [9]:
rent_data = rent_data.reset_index()

In [10]:
clean_description_text = []
for i in range(len(rent_data)):
    clean_description_text.append(text_cleaner(rent_data['description'][i]))

In [11]:
clean_facilities_text = []
for i in range(len(rent_data)):
    clean_facilities_text.append(text_cleaner(rent_data['facilities'][i]))

In [12]:
clean_text_df = pd.DataFrame({
    'clean_description': clean_description_text,
    'clean_facilities': clean_facilities_text
})

## Concatenate facilities with description 

In [13]:
clean_text_df['property_description'] = clean_text_df['clean_description'] + '.' + clean_text_df['clean_facilities']

In [14]:
clean_text_df['property_description'][0]

'die ebenerdig zu erreichende erdgeschosswohnung befindet sich in einem gepflegten familienhaus aufgrund der hanglage bietet sich ein unverbaubarer blick ins gr ne.die wohnung ist mit laminat ausgelegt das badezimmer ist gefliest und verf gt ber eine wannendusche neue wei zimmert ren ein fliesenspiegel in der che und fu leisten wurden rzlich eingebaut zur wohnung geh rt ein 10 gro er keller eine garage kann optional mitgemietet werden'

### Find the average rent for properties with descriptions < 50 and the average rent for properties with description > 50, aslo the condition
There is a considerate difference in the average totalRent for properties with descriptions less than 100 words and for properties with more than 100 words

In [15]:
rent_data['length_category'] = pd.cut(rent_data['pre_clean_description_length'], bins=[0, 100, float('inf')], labels=['<100', '>=100'])

# Group by length category and condition, then calculate the average rent
result = rent_data.groupby(['length_category'])['total_rent_new'].mean().reset_index()

# Print the result
print(result)

  length_category  total_rent_new
0            <100      589.867991
1           >=100      804.624733


## Truncate text with length more than 5000 characters.

In [16]:
def truncate_text(text, max_length=1000):
    if len(text) > max_length:
        return text[:max_length]
    else:
        return text

clean_text_df['property_description_1000'] = clean_text_df['property_description'].apply(truncate_text)

## Count the words in each property description and add it as a feature

In [17]:
def count_words(text):
    return len(text.split())

# Apply the count_words function to the 'description' column
clean_text_df['word_count'] = clean_text_df['property_description'].apply(count_words)

In [18]:
clean_text_df.columns

Index(['clean_description', 'clean_facilities', 'property_description',
       'property_description_1000', 'word_count'],
      dtype='object')

In [19]:
clean_text_df[["property_description","word_count"]]

Unnamed: 0,property_description,word_count
0,die ebenerdig zu erreichende erdgeschosswohnun...,67
1,alles neu macht der mai so kann es auch sie in...,231
2,der neubau entsteht im herzen der dresdner neu...,143
3,abseits von rm und abgasen in ihre neue wohnun...,23
4,es handelt sich hier um ein saniertes mehrfami...,91
...,...,...
259641,diese sch ne neuwertige wohnung im dachgeschos...,34
259642,hier wird eine wohnung im familienhaus angebot...,21
259643,gem tliche zimmer wohnung im obergeschoss eine...,82
259644,neubau erstbezug gehobener standard alle einhe...,202


## Adding new features for property description
1. Summary of the property description
2. Sentence embeddings for the summarized property description
3. Use sentence embeddings as feature. 

Model for german text summary -> https://huggingface.co/Einmalumdiewelt/T5-Base_GNAD
Model for german sentence embeddings -> 

In [24]:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("summarization", model="Einmalumdiewelt/T5-Base_GNAD")
device = "cpu"
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("Einmalumdiewelt/T5-Base_GNAD")
model = AutoModelForSeq2SeqLM.from_pretrained("Einmalumdiewelt/T5-Base_GNAD")

def generate_summary(text):
    inputs = tokenizer([text], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    output = model.generate(input_ids, attention_mask=attention_mask)
    return tokenizer.decode(output[0], skip_special_tokens=True)

### Testing the model with one german sentence

In [25]:
text = ["gem tliche raum wohnung in chemnitz komplette wohneinheit ist mit laminat ausgestattet bscher balkon mit blick ins gr ne keller vorhanden heller wohn und schlafbereich vom wohnzimmer zugang zum balkon sehr bsch deckenhoch gefliestes bad mit wanne."]
summary = []
for t in text:
    summary.append(generate_summary(t))



In [26]:
summary

['Die chemnitz wohnung ist komplett ausgestattet und hat einen schönen blick']

### Generating a summary for the first 1000 description

In [27]:
from tqdm import tqdm

property_descr_summary = []
for t in tqdm(clean_text_df["property_description"].head(10)):
    property_descr_summary.append(generate_summary(t))

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.30s/it]


In [29]:
clean_text_df["property_description_summary"] = property_descr_summary + [np.nan] * (len(clean_text_df) - len(property_descr_summary))

In [31]:
from sentence_transformers import SentenceTransformer

# Load the German RoBERTa model
model_name = "T-Systems-onsite/german-roberta-sentence-transformer-v2"
model = SentenceTransformer(model_name)
# Extract sentence embeddings
sentence_embeddings = model.encode(property_descr_summary)

No sentence-transformers model found with name C:\Users\DELL/.cache\torch\sentence_transformers\T-Systems-onsite_german-roberta-sentence-transformer-v2. Creating a new one with MEAN pooling.


In [32]:
sentence_embeddings

array([[ 0.12873927, -0.1620096 , -0.03720499, ..., -0.36538956,
         0.05844872,  0.11322635],
       [ 0.15093684, -0.07536464, -0.11381516, ..., -0.02618101,
         0.06989988,  0.24130122],
       [ 0.2154808 , -0.12638694,  0.07471661, ..., -0.31053618,
         0.11787111,  0.12319646],
       ...,
       [-0.03829889,  0.30238587, -0.07255299, ..., -0.06411088,
         0.14626594,  0.1873768 ],
       [-0.03090677, -0.00616135, -0.1315068 , ...,  0.077553  ,
         0.23328434, -0.17386441],
       [ 0.21111676, -0.15806942,  0.00352365, ..., -0.17597473,
         0.02649845,  0.15908512]], dtype=float32)