In [18]:
import pandas as pd
import numpy as np

In [132]:
train_data = pd.read_csv('data/train_data1.csv')
train_label = pd.read_csv('data/train_label1.csv')
test_data = pd.read_csv('data/test_data1.csv')
test_label = pd.read_csv('data/test_label1.csv')

train_data.shape

(3333, 16)

In [77]:
rand = np.arange(len(train_data))
np.random.shuffle(rand)
print(rand)

[3316  974 1225 ...  796 2516 3007]


In [78]:
print(train_data.columns)
relevant_columns = ['TYPE', 'PRICE', 'BATH', 'PROPERTYSQFT', 'STATE', 'ADMINISTRATIVE_AREA_LEVEL_2',
                    'LOCALITY', 'SUBLOCALITY', 'STREET_NAME', 'LATITUDE', 'LONGITUDE']
cols = ['BROKERTITLE', 'MAIN_ADDRESS', 'FORMATTED_ADDRESS']
# Filter DataFrame to include only relevant columns
# data_filtered = train_data[relevant_columns]
columns = train_data.columns.to_list()
# columns_to_drop_existing = [col for col in cols if col in columns]


# Further preprocessing (e.g., handle missing values, encode categorical features) can be performed here

# Example of dropping irrelevant columns
data_filtered = train_data.drop(cols, axis=1)
train_data = data_filtered
train_data.head

Index(['BROKERTITLE', 'TYPE', 'PRICE', 'BATH', 'PROPERTYSQFT', 'ADDRESS',
       'STATE', 'MAIN_ADDRESS', 'ADMINISTRATIVE_AREA_LEVEL_2', 'LOCALITY',
       'SUBLOCALITY', 'STREET_NAME', 'LONG_NAME', 'FORMATTED_ADDRESS',
       'LATITUDE', 'LONGITUDE'],
      dtype='object')


<bound method NDFrame.head of                             TYPE      PRICE  BATH  PROPERTYSQFT  \
0                 Condo for sale     315000   2.0          1400   
1                 Condo for sale  195000000  10.0         17545   
2                 House for sale     260000   2.0          2015   
3                 Condo for sale      69000   1.0           445   
4                 House for sale     690000   2.0          4004   
...                          ...        ...   ...           ...   
3328  Multi-family home for sale    1700000   7.0          7854   
3329              Condo for sale     945000   2.0           903   
3330              Co-op for sale     245000   1.0          2204   
3331              Condo for sale     598125   1.0           655   
3332              Co-op for sale     349000   1.0           750   

                                                ADDRESS  \
0                                  2 E 55th St Unit 803   
1     Central Park Tower Penthouse-217 W 57th N

In [79]:
# full_train_data = pd.concat([train_data, train_label], axis=1)
# full_train_data.iloc[0]

x_encoded = pd.get_dummies(train_data, columns = relevant_columns, drop_first=True)

In [87]:
x_encoded.iloc[0]

ADDRESS                  2 E 55th St Unit 803
LONG_NAME                     Regis Residence
TYPE_Coming Soon                            0
TYPE_Condo for sale                         1
TYPE_Contingent                             0
                                 ...         
LONGITUDE_-73.7077664                       0
LONGITUDE_-73.7055812                       0
LONGITUDE_-73.704456                        0
LONGITUDE_-73.7039503                       0
LONGITUDE_-73.7039037                       0
Name: 0, Length: 8747, dtype: object

In [86]:
x_train = pd.concat([x_encoded, train_label], axis=1)
x_train.shape

(3333, 8748)

In [88]:
def create_vocab(corpus):
    # Create vocabulary from corpus (list of documents)
    vocab = {}
    for document in corpus:
        words = document.split()  # Split document into words
        for word in words:
            if word not in vocab:
                vocab[word] = len(vocab)  # Assign a unique index to each word
    return vocab

def bow_vectorize(document, vocab):
    # Create Bag-of-Words (BOW) vector for a single document using the provided vocabulary
    bow_vector = np.zeros(len(vocab))  # Initialize BOW vector with zeros
    words = document.split()  # Split document into words
    for word in words:
        if word in vocab:
            bow_vector[vocab[word]] += 1  # Increment count of word in BOW vector
    return bow_vector

# Example usage:
corpus = ["2 E 55th St Unit 803", "Regis Residence Central Park"]
vocab = create_vocab(corpus)

# Vectorize documents using BOW representation
document1_vector = bow_vectorize(corpus[0], vocab)
document2_vector = bow_vectorize(corpus[1], vocab)

print("Vocabulary:", vocab)
print("BOW Vector for Document 1:", document1_vector)
print("BOW Vector for Document 2:", document2_vector)

Vocabulary: {'2': 0, 'E': 1, '55th': 2, 'St': 3, 'Unit': 4, '803': 5, 'Regis': 6, 'Residence': 7, 'Central': 8, 'Park': 9}
BOW Vector for Document 1: [1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
BOW Vector for Document 2: [0. 0. 0. 0. 0. 0. 1. 1. 1. 1.]


In [120]:
max_price = train_data['PRICE'].max()
min_price = train_data['PRICE'].min()
print(max_price, min_price)

195000000 3225


In [112]:
def normalize_numeric_features(data: pd.DataFrame):
    # for i, price in data['PRICE'].iteritems():
        # data.loc[i, 'PRICE'] = (data.loc[i, 'PRICE'] - min_price)/(max_price-min_price)
        # print(data.loc[i, 'PRICE'])
    data['PRICE'] = (data['PRICE'] - min_price) / (max_price - min_price) 
    return data

In [121]:
x_train = normalize_numeric_features(train_data)
x_train['PRICE']

0       0.001599
1       1.000000
2       0.001317
3       0.000337
4       0.003522
          ...   
3328    0.008702
3329    0.004830
3330    0.001240
3331    0.003051
3332    0.001773
Name: PRICE, Length: 3333, dtype: float64

In [130]:
def tokenize_text(data, column_name):
    # Custom text tokenization (e.g., lowercase, split by whitespace)
    data[column_name + '_tokens'] = data[column_name].apply(custom_word_tokenize)
    return data
def custom_word_tokenize(text):
    # Convert text to lowercase (optional: depends on your use case)
    text = text.lower()

    # Define characters to be treated as punctuation
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

    # Replace punctuation with spaces
    for char in punctuation:
        text = text.replace(char, ' ')

    # Split text into tokens (words)
    tokens = text.split()

    return tokens

In [131]:
data = tokenize_text(train_data, "BROKERTITLE")
data =tokenize_text(train_data, "ADDRESS")
data = tokenize_text(train_data, "LONG_NAME")
data

Unnamed: 0,BROKERTITLE,TYPE,PRICE,BATH,PROPERTYSQFT,ADDRESS,STATE,MAIN_ADDRESS,ADMINISTRATIVE_AREA_LEVEL_2,LOCALITY,SUBLOCALITY,STREET_NAME,LONG_NAME,FORMATTED_ADDRESS,LATITUDE,LONGITUDE,BROKERTITLE_tokens,ADDRESS_tokens,LONG_NAME_tokens
0,Brokered by Douglas Elliman -111 Fifth Ave,Condo for sale,0.001599,2.0,1400,2 E 55th St Unit 803,"New York, NY 10022","2 E 55th St Unit 803, New York, NY 10022",New York County,New York,Manhattan,East 55th Street,Regis Residence,"Regis Residence, 2 E 55th St #803, New York, N...",40.761255,-73.974483,"[brokered, by, douglas, elliman, 111, fifth, ave]","[2, e, 55th, st, unit, 803]","[regis, residence]"
1,Brokered by Serhant,Condo for sale,1.000000,10.0,17545,Central Park Tower Penthouse-217 W 57th New Yo...,"New York, NY 10019",Central Park Tower Penthouse-217 W 57th New Yo...,United States,New York,New York County,New York,West 57th Street,"217 W 57th St, New York, NY 10019, USA",40.766393,-73.980991,"[brokered, by, serhant]","[central, park, tower, penthouse, 217, w, 57th...","[west, 57th, street]"
2,Brokered by Sowae Corp,House for sale,0.001317,2.0,2015,620 Sinclair Ave,"Staten Island, NY 10312","620 Sinclair Ave, Staten Island, NY 10312",United States,New York,Richmond County,Staten Island,Sinclair Avenue,"620 Sinclair Ave, Staten Island, NY 10312, USA",40.541805,-74.196109,"[brokered, by, sowae, corp]","[620, sinclair, ave]","[sinclair, avenue]"
3,Brokered by COMPASS,Condo for sale,0.000337,1.0,445,2 E 55th St Unit 908W33,"Manhattan, NY 10022","2 E 55th St Unit 908W33, Manhattan, NY 10022",United States,New York,New York County,New York,East 55th Street,"2 E 55th St, New York, NY 10022, USA",40.761398,-73.974613,"[brokered, by, compass]","[2, e, 55th, st, unit, 908w33]","[east, 55th, street]"
4,Brokered by Sowae Corp,House for sale,0.003522,2.0,4004,584 Park Pl,"Brooklyn, NY 11238","584 Park Pl, Brooklyn, NY 11238",United States,New York,Kings County,Brooklyn,Park Place,"584 Park Pl, Brooklyn, NY 11238, USA",40.674363,-73.958725,"[brokered, by, sowae, corp]","[584, park, pl]","[park, place]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,Brokered by Ilite Realty Inc,Multi-family home for sale,0.008702,7.0,7854,448 40th St,"Brooklyn, NY 11232","448 40th St, Brooklyn, NY 11232",United States,New York,Kings County,Brooklyn,40th Street,"448 40th St, Brooklyn, NY 11232, USA",40.651222,-74.005780,"[brokered, by, ilite, realty, inc]","[448, 40th, st]","[40th, street]"
3329,Brokered by Brown Harris Stevens,Condo for sale,0.004830,2.0,903,2351 Adam Clayton Powell Jr Blvd Apt 614,"Manhattan, NY 10030","2351 Adam Clayton Powell Jr Blvd Apt 614, Manh...",New York,New York County,New York,Manhattan,2351,2351 Adam Clayton Powell Jr Blvd Apartment 614...,40.816711,-73.942050,"[brokered, by, brown, harris, stevens]","[2351, adam, clayton, powell, jr, blvd, apt, 614]",[2351]
3330,Brokered by Mjr Real Estate Llc,Co-op for sale,0.001240,1.0,2204,97-40 62 Dr Unit Lg,"Rego Park, NY 11374","97-40 62 Dr Unit Lg, Rego Park, NY 11374",United States,New York,Queens County,Queens,62nd Drive,"97-40 62nd Dr, Rego Park, NY 11374, USA",40.732538,-73.860152,"[brokered, by, mjr, real, estate, llc]","[97, 40, 62, dr, unit, lg]","[62nd, drive]"
3331,Brokered by E Realty International Corp,Condo for sale,0.003051,1.0,655,91-23 Corona Ave Unit 4G,"Elmhurst, NY 11373","91-23 Corona Ave Unit 4G, Elmhurst, NY 11373",New York,Queens County,Queens,Flushing,91-23,"91-23 Corona Ave. #4b, Flushing, NY 11373, USA",40.742770,-73.872752,"[brokered, by, e, realty, international, corp]","[91, 23, corona, ave, unit, 4g]","[91, 23]"


In [None]:
def encode_features(data):
    X = []
    y = []
    for record in data:
        # Example: Select and encode relevant features into a feature vector
        feature_vector = [
            record["PRICE"],
            record["BATH"],
            record["PROPERTYSQFT"],
            record["LATITUDE"],
            record["LONGITUDE"]
            # Add more numeric features as needed
        ]
        # Append text-based features (after processing) to the feature vector
        feature_vector.extend(record["BROKERTITLE"])
        feature_vector.extend(record["ADDRESS"])
        feature_vector.extend(record["LONG_NAME"])
        # Append feature vector to X
        X.append(feature_vector)
        # Append label (number of beds) to y (assuming it's the label)
        y.append(record["BATH"])  # Example: Using BATH as the label
    return np.array(X), np.array(y)

In [136]:
indices_train = np.random.permutation(train_data.shape[0])
for i in range(0, train_data.shape[0], 32):
    print(indices_train, len(indices_train))
    X_batch_train = train_data.iloc[indices_train[i:i+32]]
    print(X_batch_train)
    exit(1)

[ 545 2283  429 ...  972  404 2711] 3333
                                            BROKERTITLE  \
545               Brokered by Best Service Realty Corp.   
2283            Brokered by Robert DeFalco Realty, Inc.   
429   Brokered by Peter Ashe Real Estate- Peter Ashe...   
3023                      Brokered by EXIT REALTY PRIME   
1892     Brokered by KELLER WILLIAMS REALTY LANDMARK II   
2001            Brokered by Maureen Folan R E Group Llc   
2237         Brokered by Coldwell Banker American Homes   
799              Brokered by Acadia Real Estate Grp LLC   
1264  Brokered by Sotheby's International Realty - D...   
373                         Brokered by B Square Realty   
1379  Brokered by Brown Harris Stevens - 129 Montagu...   
1971        Brokered by Garfield, Leslie J. & Co., Inc.   
3072                            Brokered by RE MAX Edge   
1748         Brokered by Douglas Elliman - 140 Franklin   
160                      Brokered by Winzone Realty Inc   
2207           