In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error

In [3]:
# Load the listings export once. The CSV used to be parsed twice (once into
# `airbnb`, once into `df`); `airbnb` is now an independent in-memory copy so
# both names stay defined without a second disk read.
df = pd.read_csv("listings.csv")
airbnb = df.copy()

# No rows or columns are dropped here; missing values are filled later by the
# SimpleImputer steps of the modelling pipeline.
# Preview the first rows.
df.head()


Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2384,"Hyde Park - Walk to UChicago, 10 min to McCormick",2613,Rebecca,Hyde Park,41.7879,-87.5878,Private room,60,2,178,12/15/19,2.56,1,353
1,4505,394 Great Reviews. 127 y/o House. 40 yds to tr...,5775,Craig & Kathleen,South Lawndale,41.85495,-87.69696,Entire home/apt,105,2,395,7/14/20,2.81,1,155
2,7126,Tiny Studio Apartment 94 Walk Score,17928,Sarah,West Town,41.90289,-87.68182,Entire home/apt,60,2,384,3/8/20,2.81,1,321
3,9811,Barbara's Hideaway - Old Town,33004,At Home Inn,Lincoln Park,41.91769,-87.63788,Entire home/apt,65,4,49,10/23/19,0.63,9,300
4,10610,3 Comforts of Cooperative Living,2140,Lois,Hyde Park,41.79612,-87.59261,Private room,21,1,44,2/14/20,0.61,5,168


In [4]:
# Show the 20 listings with the highest review counts.
df.nlargest(n=20, columns=['number_of_reviews'])

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
686,10069247,Shared Artist Loft Own Private Room,51668477,Michael,Near North Side,41.88924,-87.63032,Private room,102,1,632,9/20/20,10.97,8,329
49,464581,"Large, Private Logan Square Studio",2308792,Jonathan,Logan Square,41.92451,-87.69385,Entire home/apt,60,2,629,9/17/20,6.39,1,127
181,2570620,"Basement Studio w/Free St. Parking, Humboldt Park",1214607,Coral,Humboldt Park,41.89826,-87.70336,Entire home/apt,45,1,552,9/19/20,7.16,2,147
95,1171860,"Lincoln Park Studio, Great Value!",2658212,James,Lincoln Park,41.92628,-87.64026,Entire home/apt,99,30,541,7/29/20,6.16,4,96
419,6504170,West Town Traveler's Lodge,4591141,Moudi,West Town,41.90064,-87.67056,Entire home/apt,106,7,511,6/10/20,7.9,2,23
26,189821,"Best in Chicago, private, amazing garden space",899757,Meighan,Logan Square,41.92918,-87.70219,Entire home/apt,201,3,506,9/12/20,4.61,1,134
96,1183249,"Boystown Studio, Outstanding Value!",2658212,James,Lake View,41.9395,-87.64483,Entire home/apt,99,26,504,7/24/20,5.67,4,31
44,350347,"Luxury Chicago Loft, 1 blk to train w/ 2 pkg s...",845242,Jay & Elisa,Lake View,41.93986,-87.65473,Entire home/apt,128,3,499,9/6/20,4.79,1,313
214,3061036,Private Room-sunny loft. McCormick place/down...,1876022,Leslie,Near South Side,41.85298,-87.62761,Private room,85,1,499,9/13/20,6.46,1,152
618,8706915,"Polk Street Coach House Apartment, Little Ital...",43282155,Ken,Near West Side,41.87186,-87.66469,Entire home/apt,98,2,491,9/13/20,8.38,1,279


In [5]:
# Column labels of the loaded frame as a plain Python list.
df.columns.tolist()

['id',
 'name',
 'host_id',
 'host_name',
 'neighbourhood',
 'latitude',
 'longitude',
 'room_type',
 'price',
 'minimum_nights',
 'number_of_reviews',
 'last_review',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [6]:
def priceType(price):
    """Bucket a price into one of four band labels.

    Bands (all comparisons are strict "greater than"):
        price > 150         -> "A"
        100 < price <= 150  -> "B"
        60 < price <= 100   -> "c"
        otherwise           -> "D"

    Note: the third label is a lowercase "c" (unlike "A", "B", "D"). It is
    preserved as-is because the returned strings are used as class labels.
    """
    # Thresholds ordered from highest to lowest; the first strict match wins.
    bands = ((150, "A"), (100, "B"), (60, "c"))
    for lower_bound, label in bands:
        if price > lower_bound:
            return label
    # Reached for prices <= 60 and for values that compare False against
    # every threshold (e.g. NaN).
    return "D"

In [7]:
# How many listings fall into each price band produced by priceType.
(
    df["price"]
    .map(priceType)
    .value_counts()
)

c    1480
A    1369
B    1192
D    1152
Name: price, dtype: int64

In [8]:
# Target vector: the price band label of every listing.
y = df.loc[:, "price"].map(priceType)

In [9]:
# Feature matrix: the listing attributes used as predictors.
X = df[[
    "neighbourhood",
    "room_type",
    "minimum_nights",
    "number_of_reviews",
    "reviews_per_month",
    "availability_365",
]]

In [10]:
# Split the feature names by dtype: object columns are treated as categorical,
# everything else as numerical.
cat_col = X.select_dtypes(include='object').columns.tolist()
num_col = X.select_dtypes(exclude='object').columns.tolist()

In [11]:
# Hold out a test set, stratified on the price band so every band is
# represented in both parts; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [12]:
# Gradient boosting classifier: 5000 boosting stages with a small learning rate.
# random_state is fixed so repeated runs fit the same model (the train/test
# split is already seeded with 42; the estimator previously was not seeded).
modle = GradientBoostingClassifier(
    n_estimators=5000,
    learning_rate=0.002,
    random_state=42,
)

In [13]:
# Categorical features: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical features: median imputation followed by standard scaling.
numerical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ("anyThing", StandardScaler()),
])

# Route each group of columns through its own sub-pipeline.
preprocessing = ColumnTransformer(transformers=[
    ('cat', categorical_pipe, cat_col),
    ('num', numerical_pipe, num_col),
])

# Full model: preprocessing -> model-based feature selection -> classifier.
# Note that the same `modle` object is handed to both the selector and the
# final classifier step.
selector = SelectFromModel(modle, prefit=False)
rf = Pipeline(steps=[
    ('preprocess', preprocessing),
    ('feature_selection', selector),
    ('classifier', modle),
])

In [14]:
# Fit the whole pipeline (preprocessing, feature selection, classifier) on the
# training split.
rf.fit(X=X_train, y=y_train)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['neighbourhood',
                                                   'room_type']),
                                                 ('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('a

In [15]:
# Score of the fitted pipeline on the held-out test split.
rf.score(X=X_test, y=y_test)

0.5635103926096998

In [16]:
# Score of the fitted pipeline on the training split, for comparison with the
# test score.
rf.score(X=X_train, y=y_train)

0.6142783769902413

In [17]:
# Single hypothetical listing to score with the fitted pipeline.
# Columns mirror the feature columns selected for X; latitude/longitude are
# not model features and are therefore omitted.
prd = pd.DataFrame({
    "neighbourhood": ["Logan Square"],
    # Must match the dataset's spelling exactly ("Private room", lowercase "r").
    # The one-hot encoder uses handle_unknown='ignore', so an unseen spelling
    # such as "Private Room" would be silently encoded as all zeros.
    "room_type": ["Private room"],
    "minimum_nights": [2],
    "number_of_reviews": [100],
    "reviews_per_month": [2.5],
    "availability_365": [30],
})

In [18]:
# Predict the price band label for the hypothetical listing.
rf.predict(X=prd)

array(['D'], dtype=object)

In [19]:
import pickle

# Read back whatever object was previously serialised to victoria.pkl.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('victoria.pkl', mode='rb') as pkl_file:
    data = pickle.load(pkl_file)

In [20]:
# Display the object that was unpickled from victoria.pkl.
data

'victoria.pkl'

In [21]:
# Check the Python type of the unpickled object.
type(data)

str

In [1]:
import pickle

import joblib

# Persist the estimator object `modle` to disk with joblib.
# NOTE(review): this saves `modle`, not the `rf` pipeline that wraps the
# preprocessing and feature-selection steps — confirm that is the intended
# artifact.
filename = 'victoria.pkl'
joblib.dump(value=modle, filename=filename)

NameError: name 'modle' is not defined

In [None]:
# Serialise the estimator to victoria.pkl with pickle.
# Two fixes over the previous version of this cell:
#   * the file is opened for binary writing ('wb'); a handle opened with 'rb'
#     cannot be written to;
#   * the model object itself is dumped, not the filename string (which is why
#     loading the file returned the str 'victoria.pkl').
with open('victoria.pkl', 'wb') as f:
    pickle.dump(modle, f)

In [None]:
# Bare attribute reference: evaluates to the bound `score` method without
# calling it.
# NOTE(review): presumably a call with evaluation data was intended — confirm.
modle.score