In [26]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# The Data
in this dataset my goal is to train a random forest regression model with a subset of columns from this interesting dataset in order to predict the price of a used car.
https://www.kaggle.com/orgesleka/used-cars-database

In [27]:
# Reading the dataset
autos= pd.read_csv('autos.csv', encoding='latin-1')
autos.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [28]:
data.shape

(768, 9)

# Task 1 : Data Processing
first i will remove the columns that are useless in the price prediction of the vehicles from the data set.

In [29]:
autos=autos.drop(['seller','offerType','dateCrawled','dateCreated','lastSeen','nrOfPictures','postalCode'],axis=1)
print(autos.shape)
autos.head()

(371528, 13)


Unnamed: 0,name,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,Golf_3_1.6,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,
1,A5_Sportback_2.7_Tdi,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja
2,"Jeep_Grand_Cherokee_""Overland""",9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,
3,GOLF_4_1_4__3TÜRER,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein
4,Skoda_Fabia_1.4_TDI_PD_Classic,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein


In [30]:
autos.describe()

Unnamed: 0,price,yearOfRegistration,powerPS,kilometer,monthOfRegistration
count,371528.0,371528.0,371528.0,371528.0,371528.0
mean,17295.14,2004.577997,115.549477,125618.688228,5.734445
std,3587954.0,92.866598,192.139578,40112.337051,3.712412
min,0.0,1000.0,0.0,5000.0,0.0
25%,1150.0,1999.0,70.0,125000.0,3.0
50%,2950.0,2003.0,105.0,150000.0,6.0
75%,7200.0,2008.0,150.0,150000.0,9.0
max,2147484000.0,9999.0,20000.0,150000.0,12.0


In [31]:
#control if we have missing data (nan) in dataset 
autos.isnull().sum().to_frame('nulls')

Unnamed: 0,nulls
name,0
price,0
abtest,0
vehicleType,37869
yearOfRegistration,0
gearbox,20209
powerPS,0
model,20484
kilometer,0
monthOfRegistration,0


As we have seen, vehicleType, gearbox, model, fuelType and notRepairedDamage columns contain missing data . with Most Frequncy Value and Interpolation methods we will try to complete this missing data.

In [32]:
# complete missing data with most frequncy value
model=autos["model"].value_counts()
print(model)
autos["model"].fillna("golf",inplace=True)

golf               30070
andere             26400
3er                20567
polo               13092
corsa              12573
                   ...  
serie_2                8
rangerover             6
serie_3                4
serie_1                2
discovery_sport        1
Name: model, Length: 251, dtype: int64


In [33]:
# complete missing data with most frequncy value
fuelType=autos["fuelType"].value_counts()
print(fuelType)
autos["fuelType"].fillna("benzin",inplace=True)

benzin     223857
diesel     107746
lpg          5378
cng           571
hybrid        278
andere        208
elektro       104
Name: fuelType, dtype: int64


In [34]:
# complete missing data with most frequncy value
notRepairedDamage=autos["notRepairedDamage"].value_counts()
print(notRepairedDamage)
autos["notRepairedDamage"].fillna("nein",inplace=True)

nein    263182
ja       36286
Name: notRepairedDamage, dtype: int64


In [35]:
# We will use the Interpolation method in gearbox and vehicleType columns, (I'll explain the reason later.)
#but  to use the Interpolation method, we need to convert gearbox and vehicleType columns from object to category.

autos= autos.astype({"name":'category',"abtest":'category',"vehicleType":'category',"gearbox":'category',"model":'category',"fuelType":'category',"brand":'category',"notRepairedDamage":'category'}) 
autos.dtypes
gearbox=autos["gearbox"].value_counts()
print(gearbox)

manuell      274214
automatik     77105
Name: gearbox, dtype: int64


There are two types of gearbox but frequency of manuell is higher than automatik but we cannot fill nan values with maxFreq value ,so i will use Interpolation method to complete nans

In [36]:
autos["gearbox"]=(autos["gearbox"].cat.codes.replace(-1, np.nan).interpolate().astype(int).astype('category').cat.rename_categories(autos["gearbox"].cat.categories))

In [37]:
vehicleType=autos["vehicleType"].value_counts()
print(vehicleType)

limousine     95894
kleinwagen    80023
kombi         67564
bus           30201
cabrio        22898
coupe         19015
suv           14707
andere         3357
Name: vehicleType, dtype: int64


here no particular value is more frequent so i will use Interpolation method

In [38]:
autos["vehicleType"]=autos["vehicleType"].interpolate(method='pad')
#Delete line 1 min./From the interpolation process, only the nan in the first line remained. So I deleted it.
autos=autos.drop([0],axis=0)

In [39]:
print(len(autos))
autos=autos[(autos.price > 100) & (autos.price < 200000) ]
print(len(autos))

371527
357003


In [40]:
autos.isnull().sum().to_frame('nulls')

Unnamed: 0,nulls
name,0
price,0
abtest,0
vehicleType,0
yearOfRegistration,0
gearbox,0
powerPS,0
model,0
kilometer,0
monthOfRegistration,0


### Data LabelEncoder and StandardScaler

In [41]:
#we will do LabelEncoder to feature data(x) .
x_data=autos.drop('price',axis=1)
#label encoding
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
encoded_data=x_data.apply(LabelEncoder().fit_transform) #tüm veriler encod edildi 
encoded_data.head()

Unnamed: 0,name,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
1,3806,1,3,99,1,190,118,11,5,3,1,0
2,86341,1,7,92,0,163,119,11,8,3,14,1
3,73230,1,4,89,1,75,118,12,6,1,38,1
4,162875,1,4,96,1,69,103,9,7,3,31,1
5,26062,1,6,83,1,102,11,12,10,1,2,0


In [42]:
#Make StandardScaler to feature data(x) data
from sklearn.preprocessing import StandardScaler
sc1=StandardScaler()
X=pd.DataFrame(sc1.fit_transform(encoded_data))
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-1.659527,0.964455,-0.824621,0.985113,0.545845,1.079217,0.231478,0.18293,-0.224151,1.256385,-1.470376,-3.117047
1,-0.379917,0.964455,1.523398,0.066578,-1.83202,0.696121,0.245736,0.18293,0.592652,1.256385,-0.497459,0.320816
2,-0.583188,0.964455,-0.237617,-0.32708,0.545845,-0.552487,0.231478,0.534075,0.048117,-0.635043,1.298694,0.320816
3,0.806654,0.964455,-0.237617,0.591455,0.545845,-0.63762,0.017599,-0.519359,0.320385,1.256385,0.774816,0.320816
4,-1.314474,0.964455,0.936393,-1.114395,0.545845,-0.169392,-1.294189,0.534075,1.137187,-0.635043,-1.395536,-3.117047


In [47]:
x = X.iloc[:,:].values       #features (x)
y= autos.iloc[:,1:2].values   #target (price)

 ## Split the data into a Training and Testing set

In [48]:
# splitting dataset to train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.33, random_state = 0)

# Task 2: Regression

In [49]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(x_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [50]:
#algoritmeyi test et
y_pred=rfr.predict(x_test)
print('score: %.2f' % rfr.score(x_test, y_test))  #score 

score: 0.80


In [52]:
#here we will do a comparison between real and predicted data.
print('Real Data')
print(pd.DataFrame(y_test).head(7))
print('predicted Data')
print(pd.DataFrame(y_pred).head(7))

Real Data
      0
0  2500
1  1200
2  2000
3  4990
4  1580
5  4250
6  5999
predicted Data
        0
0  1579.0
1  1364.7
2  1527.9
3  9050.3
4  2090.4
5  4419.8
6  6324.6


# Task 3: Classification

here i will use another dataset for Classification task ,in this dataset i will  build a machine learning model to accurately predict whether or not the patients in the dataset have diabetes or not .

https://www.kaggle.com/uciml/pima-indians-diabetes-database

In [53]:
autos= pd.read_csv('diabetes.csv', encoding='latin-1')
autos.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [54]:
x=data.drop(['Outcome'],axis=1)
y= data.iloc[:,8:]

### Split the data into a Training and Testing set

In [56]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.33, random_state = 0)

In [57]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(x_train)
X_test=sc.transform(x_test)

## Classification Model

In [58]:

#Logistic Regression
from sklearn.linear_model import LogisticRegression
logr=LogisticRegression(random_state=0)
logr.fit(X_train,y_train)
y_pred=logr.predict(X_test)


  y = column_or_1d(y, warn=True)


In [60]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
print("Logistic Regression Results ")
print(cm)

print('score: %.2f' % logr.score(X_test, y_test))  #score 


Logistic Regression Results 
[[154  16]
 [ 40  44]]
score: 0.78


# Task 4: Recommendation Systems

For Recommendation Systems i will use anime movie dataset on kaggle site to predict users rating to movies<br>
https://www.kaggle.com/CooperUnion/anime-recommendations-database

In [67]:
from surprise import Reader, Dataset,SVD ,SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline,KNNBasic , KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
import pandas as pd
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd
from surprise import accuracy

#reading data
rating = pd.read_csv('rating.csv',encoding = "latin-1")
anime = pd.read_csv('anime.csv') 


In [68]:
df_rating1 = pd.merge(rating,anime.drop('rating',axis=1),on='anime_id')
df_rating1.head()

Unnamed: 0,user_id,anime_id,rating,name,genre,type,episodes,members
0,1,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,683297
1,3,20,8,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,683297
2,5,20,6,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,683297
3,6,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,683297
4,10,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,683297


In [69]:
df_rating1.rating.unique()

array([-1,  8,  6,  9, 10,  7,  5,  4,  1,  3,  2], dtype=int64)

In [71]:
#we have -1 in rating column so i will remove it 
df_rating=df_rating1[df_rating1.rating!=-1]
df_rating.head()

Unnamed: 0,user_id,anime_id,rating,name,genre,type,episodes,members
1,3,20,8,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,683297
2,5,20,6,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,683297
5,21,20,8,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,683297
6,28,20,9,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,683297
7,34,20,9,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,683297


In [73]:
veriler=df_rating.iloc[:,:]

reader = Reader(rating_scale=(1, 10))# ratings 1 ile 10 

data =Dataset.load_from_df(veriler[['user_id', 'anime_id', 'rating']], reader)


In [74]:
trainset = data.build_full_trainset() 
sim_msd= {'name':'msd', 'user_based':False} # name =  similarity modul 
                                            # compute  similarities between items  
knn_means = knns.KNNWithMeans(sim_options=sim_msd) 
knn_means.fit(trainset)  #fit model

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x194040d7e48>

In [75]:
#test algorithm  
# user_id, film_id , real rating
print(knn_means.predict(252, 20, 6))
print(knn_means.predict(282, 226, 10))
print(knn_means.predict(73099, 79, 7))
print(knn_means.predict(10581, 24, 10))
print(knn_means.predict(27317, 355, 7))


user: 252        item: 20         r_ui = 6.00   est = 7.67   {'actual_k': 22, 'was_impossible': False}
user: 282        item: 226        r_ui = 10.00   est = 9.55   {'actual_k': 40, 'was_impossible': False}
user: 73099      item: 79         r_ui = 7.00   est = 6.19   {'actual_k': 40, 'was_impossible': False}
user: 10581      item: 24         r_ui = 10.00   est = 10.00   {'actual_k': 1, 'was_impossible': False}
user: 27317      item: 355        r_ui = 7.00   est = 6.91   {'actual_k': 27, 'was_impossible': False}


#For movie with ID 20, we get an estimated prediction of 7.67