In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sb
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
!wget https://raw.githubusercontent.com/vanessagiovani/machinelearning/main/listings.csv

In [4]:
data = pd.read_csv("listings.csv")
print(data.dtypes)
print(data.isna().values.any())

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object
True


In [5]:
newData = data[["price", "number_of_reviews"]]
print(newData.head())

   price  number_of_reviews
0     83                  1
1     81                 18
2     69                 20
3    206                 14
4     94                 22


train data linear regression

In [6]:
train, test = train_test_split(newData, test_size=0.2)
regression = linear_model.LinearRegression()
regression.fit(train[["price"]], train[["number_of_reviews"]])
print('Coefficients (theta 1): ', regression.coef_)
print('Intercept (theta 0): ',regression.intercept_)

Coefficients (theta 1):  [[-0.00397124]]
Intercept (theta 0):  [13.32116741]


In [7]:
prediction = regression.predict(test[["price"]])
for i in range(len(test)):
  print(test[["price"]].values[i], prediction[i])

print("MAE : ", mean_absolute_error(test[["number_of_reviews"]], prediction))
print("MSE : ", mean_squared_error(test[["number_of_reviews"]], prediction))
print("R2 : ", r2_score(test[["number_of_reviews"]], prediction))

[115] [12.86447518]
[58] [13.09083567]
[100] [12.92404373]
[287] [12.18142245]
[35] [13.18217412]
[100] [12.92404373]
[125] [12.82476281]
[140] [12.76519426]
[50] [13.12260557]
[218] [12.45543779]
[110] [12.88433136]
[110] [12.88433136]
[90] [12.9637561]
[89] [12.96772733]
[39] [13.16628917]
[303] [12.11788266]
[410] [11.69296032]
[60] [13.0828932]
[400] [11.73267269]
[192] [12.55868994]
[100] [12.92404373]
[319] [12.05434287]
[25] [13.22188649]
[85] [12.98361228]
[139] [12.76916549]
[550] [11.13698717]
[65] [13.06303701]
[110] [12.88433136]
[169] [12.65002839]
[222] [12.43955284]
[90] [12.9637561]
[50] [13.12260557]
[69] [13.04715207]
[187] [12.57854613]
[106] [12.90021631]
[208] [12.49515015]
[128] [12.8128491]
[187] [12.57854613]
[75] [13.02332465]
[268] [12.25687595]
[140] [12.76519426]
[200] [12.52692005]
[60] [13.0828932]
[165] [12.66591334]
[231] [12.40381171]
[347] [11.94314824]
[50] [13.12260557]
[137] [12.77710797]
[151] [12.72151065]
[125] [12.82476281]
[60] [13.0828932]
[15

KNN classification

In [8]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [9]:
data = pd.read_csv("listings.csv")

for i in data:
  print(i, data[i].dtypes)
  if data[i].isna().values.any() and data[i].dtypes != object:
      data[i] = data[i].fillna(data[i].mean())

train, test = train_test_split(data, test_size=0.2)

id int64
name object
host_id int64
host_name object
neighbourhood_group object
neighbourhood object
latitude float64
longitude float64
room_type object
price int64
minimum_nights int64
number_of_reviews int64
last_review object
reviews_per_month float64
calculated_host_listings_count int64
availability_365 int64


train, classify data, and indetify K

In [11]:
KNN = KNeighborsClassifier(n_neighbors = 3).fit(train[["price", "minimum_nights"]], train["number_of_reviews"])

newClassification = KNN.predict([[17,8],[8,10]])
print(newClassification)

classification = KNN.predict(test[["price", "minimum_nights"]])
accuracy = accuracy_score(test["number_of_reviews"], classification)
MAE = mean_absolute_error(test["number_of_reviews"], classification)
MSE = mean_squared_error(test["number_of_reviews"], classification)

print(" ACC : %.2f" % accuracy)
print(" MAE : %.2f" % MAE)
print(" MSE : %.2f" % MSE)

[3 1]
 ACC : 0.30
 MAE : 14.16
 MSE : 1314.89


In [12]:
Ks = 10
accuracy = np.zeros((Ks-1))
ConfustionMx = [];
for n in range(1, Ks):    
    KNN = KNeighborsClassifier(n_neighbors = n).fit(train[["price", "minimum_nights"]], train["number_of_reviews"])  
    classification = KNN.predict(test[["price", "minimum_nights"]])
    accuracy[n - 1] = accuracy_score(test["number_of_reviews"], classification)
    
print("Best  ACC : %.2f" % accuracy.max(), ", with k = ", accuracy.argmax() + 1)

Best  ACC : 0.32 , with k =  9


logistic regression classification

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score

train and classify data

In [15]:
LGR = LogisticRegression(C = 0.01, solver = 'liblinear').fit(train[["price", "minimum_nights"]], train["number_of_reviews"])

newClassification = LGR.predict([[17,8],[8,10]])
print(newClassification)

classification = LGR.predict(test[["price", "minimum_nights"]])
accuracy = accuracy_score(test["number_of_reviews"], classification)
MAE = mean_absolute_error(test["number_of_reviews"], classification)
MSE = mean_squared_error(test["number_of_reviews"], classification)
F1  = f1_score(test["number_of_reviews"], classification, average='weighted') 
cnf_matrix = confusion_matrix(test["number_of_reviews"], classification, labels=[0,1])


print(" ACC : %.2f" % accuracy)
print(" MAE : %.2f" % MAE)
print(" MSE : %.2f" % MSE)
print(" F1  : %.2f" % F1)
print (classification_report(test["number_of_reviews"], classification))
print(cnf_matrix)

[0 0]
 ACC : 0.36
 MAE : 14.35
 MSE : 1374.13
 F1  : 0.19
              precision    recall  f1-score   support

           0       0.36      1.00      0.52       562
           1       0.00      0.00      0.00       215
           2       0.00      0.00      0.00       105
           3       0.00      0.00      0.00        77
           4       0.00      0.00      0.00        53
           5       0.00      0.00      0.00        47
           6       0.00      0.00      0.00        31
           7       0.00      0.00      0.00        17
           8       0.00      0.00      0.00        25
           9       0.00      0.00      0.00        25
          10       0.00      0.00      0.00        23
          11       0.00      0.00      0.00        23
          12       0.00      0.00      0.00        12
          13       0.00      0.00      0.00        21
          14       0.00      0.00      0.00        15
          15       0.00      0.00      0.00         6
          16       0.00

  _warn_prf(average, modifier, msg_start, len(result))


probability each class

In [16]:
classificationProb = LGR.predict_proba(test[["price", "minimum_nights"]])
print(classificationProb)

[[8.62611212e-01 9.54467561e-02 1.99239691e-02 ... 6.17217582e-54
  1.99270888e-49 4.67926357e-54]
 [4.07887889e-01 1.86660953e-01 1.12411358e-01 ... 2.38671614e-13
  2.52033389e-12 2.13071475e-13]
 [5.44622158e-01 1.71968907e-01 1.04200831e-01 ... 2.17872452e-07
  8.14463852e-08 7.17601750e-08]
 ...
 [4.20408348e-01 1.86936562e-01 1.16504399e-01 ... 5.77831090e-11
  2.60164115e-10 4.35121052e-11]
 [1.47021328e-01 8.76286341e-02 6.31962304e-02 ... 1.35027549e-06
  3.87460395e-06 1.30447500e-06]
 [7.83169767e-01 1.34234526e-01 6.23098222e-02 ... 9.28110656e-12
  8.28471749e-13 1.01556376e-12]]
