# Feature Extraction

##### Author: Jim Ren

In [1]:
import re, json, random

In [2]:
features_file = open('features.txt', 'r')
features = features_file.read().lower()
features = re.sub(';$', '', features)
features = re.split(';', features)
print features

['downtown berkeley', 'studio', 'bart', 'gourmet ghetto', 'laundry', 'fireplace', 'hardwood', 'carpet', 'remodeled', 'smoke free', 'fully furnished', 'soda hall', 'no pet', 'no bike', 'bathtub', 'dishwasher', 'garage', 'carpeted floor', 'granite', 'garbage disposal', 'free wifi', 'elevator', 'bike rack', 'balcony', 'no parking']


In [3]:
def parseRoomCount(string):
    """Translate a room-count token into an integer.

    string: a token such as '3' or 'three'. Recognised tokens are the digits
        '0'-'9', '10', and the English words 'one' through 'ten'.

    Returns the corresponding int, or None when the token is not recognised.
    """
    # A lookup table keeps every token/value pair visible side by side; the
    # previous if/elif ladder mapped 'ten' to 0 by mistake.
    room_counts = {
        '0': 0,
        '1': 1, 'one': 1,
        '2': 2, 'two': 2,
        '3': 3, 'three': 3,
        '4': 4, 'four': 4,
        '5': 5, 'five': 5,
        '6': 6, 'six': 6,
        '7': 7, 'seven': 7,
        '8': 8, 'eight': 8,
        '9': 9, 'nine': 9,
        '10': 10, 'ten': 10,
    }
    return room_counts.get(string)

In [9]:
from math import sin, cos, sqrt, atan2, radians

def latlon_dist(lat2, lon2, lat1=37.8719, lon1=-122.2585):
    """Great-circle (haversine) distance in miles between two points.

    lat2, lon2: latitude/longitude of the destination, in decimal degrees.
    lat1, lon1: latitude/longitude of the origin, in decimal degrees. The
        defaults are the reference point that used to be hard-coded in the
        body, so existing two-argument calls behave as before.

    Returns the distance as a float, in miles.
    """
    # approximate radius of earth in km
    R = 6373.0
    # kilometres -> miles conversion factor
    KM_TO_MILES = 0.621371

    phi1 = radians(lat1)
    lam1 = radians(lon1)
    phi2 = radians(lat2)
    lam2 = radians(lon2)

    dlon = lam2 - lam1
    dlat = phi2 - phi1

    a = sin(dlat / 2)**2 + cos(phi1) * cos(phi2) * sin(dlon / 2)**2
    # Rounding can push `a` marginally above 1 for near-antipodal points,
    # which would make sqrt(1 - a) raise ValueError; clamp to stay in domain.
    a = min(1.0, a)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c * KM_TO_MILES

# Sanity check: distance from the function's built-in reference point to a nearby coordinate.
print(latlon_dist(37.864153, -122.259464))

0.538011015371


In [120]:
design_matrix = []
cooked_labels = []
all_data = []
postings = []
ct = 0
for asdf in range(0, 10):
    json_file = open('data'+str(asdf*100)+'.json', 'r')
    input_data = json.load(json_file)
    for posting in input_data:
        details = input_data[posting]
        description = details['features'].lower()

        # Count binary features for existence
        sample_features = []
        for feature in features:
            if feature in description:
                sample_features.append(1)
            else:
                sample_features.append(0)


        # Count quantitative features
        # Bedroom count
        bedroom_count = -1
        bedroom_keywords = re.findall('\S+?(?=\sbedroom|\sbed|\sbr)s?|(?<=bedroom)s?.+?\S+', description)
        if bedroom_keywords:
            for i in bedroom_keywords:
                count = parseRoomCount(i)
                if count != None:
                    bedroom_count = count
                    break
        sample_features.append(bedroom_count)
        # Bathroom count
        bathroom_count = -1
        bathroom_keywords = re.findall('\S+?(?=\sbathroom|\sbath|\sba)s?|(?<=bathroom)s?.+?\S+', description)
        if bathroom_keywords:
            for i in bathroom_keywords:
                count = parseRoomCount(i)
                if count != None:
                    bathroom_count = count
                    break
        sample_features.append(bathroom_count)
        # Square feet
        square_feet = -1
        square_feet_keywords = re.findall('\d+(?=ft2)', details['size'])
        if square_feet_keywords:
            square_feet = float(square_feet_keywords[0])
        sample_features.append(square_feet)

        # Add other numerical values
        # Price
        if details['price'] == 'n/a':
            continue
        price = float(re.sub('^\$', '', details['price']))
        sample_features.append(price)
        # Longitude
        try:
            longitude = float(details['longitude'])
        except ValueError:
            longitude = 360.0
            continue
        sample_features.append(longitude)
        # Latitude
        try:
            latitude = float(details['latitude'])
        except ValueError:
            latitude = 180.0
            continue
        sample_features.append(latitude)
        dist = latlon_dist(latitude, longitude)
        sample_features.append(dist)
        
        if bedroom_count == -1:
            continue

        design_matrix.append(sample_features)
        # Cooking training data
        if bedroom_count >= 2 and bedroom_count <= 3 and price <= 4000 and price >= 2300 and dist < 2:
            if random.random() < 1:
                cooked_labels.append(1)
                ct+=1
            else:
                cooked_labels.append(0)
        else:
            cooked_labels.append(0)
        postings.append(posting)
        all_data.append(details)
    print asdf, ct
print len(cooked_labels)
labels = cooked_labels

0 9
1 17
2 21
3 33
4 45
5 55
6 63
7 74
8 83
9 84
444


In [91]:
# Dump the whole design matrix (one feature row per kept posting) for visual inspection.
print design_matrix

[[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, -1, 2255.0, -122.260344, 37.877106, 0.37361198844163324], [0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, -1, 2595.0, -122.265387, 37.874582, 0.4189733515480534], [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 3, -1, 1100.0, 3100.0, -122.307422, 37.911396, 3.8173179441096323], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, -1, 1350.0, 3500.0, -122.286158, 37.843241, 2.4902464500284003], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, -1, 818.0, 3034.0, -122.288969, 37.839369, 2.7963907433340007], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, -1, 640.0, 1850.0, -122.252601, 37.833289, 2.687946357957304], [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -1, 1423.0, 4982.0, -122.28899, 37.83936, 2.797572379374738], [0, 0,

In [92]:
# Cooking training data
# Persist the synthetic labels as a two-column CSV: posting key, label.
# The with-block guarantees the file is flushed and closed even if a write
# fails part-way. postings and cooked_labels are expected to have the same
# length, so they are walked in parallel.
with open('rnd_preference.csv', 'w') as cooked_file:
    cooked_file.write('Link,Label\n')
    for posting, label in zip(postings, cooked_labels):
        cooked_file.write(str(posting) + ',' + str(label) + '\n')

In [105]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, svm, datasets, cross_validation
%matplotlib inline

In [94]:
# Convert the accumulated feature rows and labels into NumPy arrays for scikit-learn.
X = np.array(design_matrix)
y = np.array(labels)

In [95]:
# Fit a logistic regression whose regularisation strength C is chosen by
# LogisticRegressionCV's built-in cross-validation, then show the chosen value.
# NOTE(review): the fit uses all of X/y; no hold-out split is made in this cell.
logr = linear_model.LogisticRegressionCV()
logr.fit(X, y)
print logr.C_

[ 0.0001]


In [96]:
# Second column of predict_proba, i.e. the probability of logr.classes_[1]
# (presumably label 1, since the labels are 0/1 — confirm).
probs = logr.predict_proba(X)[:, 1]
# Total absolute gap between predicted probability and label, measured on the
# same data the model was fitted on.
np.sum(np.abs(probs - y))

128.82071579317821

In [115]:
# Fit an SVM classifier with balanced class weights and probability estimates enabled.
S = svm.SVC(class_weight = "balanced", probability = True)
S.fit(X, y)
# NOTE: this rebinds `probs`, replacing the logistic-regression probabilities.
probs = S.predict_proba(X)[:, 1]

In [119]:
# Rank postings by predicted probability, ascending.
order = np.argsort(probs)
posts = np.array(postings)
# Ten lowest-scored postings, then the ten highest-scored.
print posts[order[:10]], posts[order[-10:]]
# Labels of the ten highest-scored postings ...
print np.array(labels)[order[-10:]]
# ... and of the ten lowest-scored.
print np.array(labels)[order[:10]]
# Probabilities of the ten highest-scored, then the ten lowest-scored.
print probs[order[-10:]], probs[order[:10]]

[u'https://sfbay.craigslist.org/eby/apa/5501823690.html'
 u'https://sfbay.craigslist.org/eby/apa/5503520226.html'
 u'https://sfbay.craigslist.org/eby/apa/5473978305.html'
 u'https://sfbay.craigslist.org/eby/apa/5506203549.html'
 u'https://sfbay.craigslist.org/eby/apa/5506198514.html'
 u'https://sfbay.craigslist.org/eby/apa/5509559468.html'
 u'https://sfbay.craigslist.org/eby/apa/5482809697.html'
 u'https://sfbay.craigslist.org/eby/apa/5514486605.html'
 u'https://sfbay.craigslist.org/eby/apa/5487551600.html'
 u'https://sfbay.craigslist.org/eby/apa/5519361835.html'] [u'https://sfbay.craigslist.org/eby/apa/5512455140.html'
 u'https://sfbay.craigslist.org/eby/apa/5512622882.html'
 u'https://sfbay.craigslist.org/eby/apa/5519224401.html'
 u'https://sfbay.craigslist.org/eby/apa/5504915723.html'
 u'https://sfbay.craigslist.org/eby/apa/5493056353.html'
 u'https://sfbay.craigslist.org/eby/apa/5507121228.html'
 u'https://sfbay.craigslist.org/eby/apa/5508361510.html'
 u'https://sfbay.craigslist.or

In [129]:
import json
# Re-rank from highest to lowest predicted probability.
order = np.argsort(probs)[::-1]
# Export the probabilities, feature rows and raw posting records, all sorted
# in that order, as a single JSON document.
with open('data.json', 'w') as outfile:
    json.dump({"probs": probs[order].tolist(), 
               "design_matrix": X[order].tolist(), 
               "postings": np.array(all_data)[order].tolist()}, outfile)