In [1]:
# Importing the libraries
import numpy as np
#import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn import metrics


In [2]:
# Importing the dataset
dataset = pd.read_csv('data.csv', sep = ';') # your datase. if csv sepaartor is , then sep = ';' is not needed
print(dataset.head())


   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94


In [3]:
# you can encode your categorical data with label encoder, but you might want to just simply drop it
#dataset.drop(['State'], axis=1)

# encodig categorical data

X = dataset.iloc[:, :-1].values  # stuff from what you want to make a prediction
y = dataset.iloc[:, 4].values   # Target variable
labelencoder_X = LabelEncoder()
X[:,3] = labelencoder_X.fit_transform(X[:,3])
onehotencoder = OneHotEncoder(categorical_features = [3])
X = onehotencoder.fit_transform(X).toarray()

In [4]:
X.shape

(50, 6)

In [5]:
X[:5] # head of X

array([[  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          1.65349200e+05,   1.36897800e+05,   4.71784100e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.62597700e+05,   1.51377590e+05,   4.43898530e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          1.53441510e+05,   1.01145550e+05,   4.07934540e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          1.44372410e+05,   1.18671850e+05,   3.83199620e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          1.42107340e+05,   9.13917700e+04,   3.66168420e+05]])

In [6]:
# about onehotencoding : https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/

# Avoiding dummy variable trap: https://www.algosome.com/articles/dummy-variable-trap-regression.html
X = X[:, 1:] #basiccaly we delete the first col
X[:5] 

array([[  0.00000000e+00,   1.00000000e+00,   1.65349200e+05,
          1.36897800e+05,   4.71784100e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.62597700e+05,
          1.51377590e+05,   4.43898530e+05],
       [  1.00000000e+00,   0.00000000e+00,   1.53441510e+05,
          1.01145550e+05,   4.07934540e+05],
       [  0.00000000e+00,   1.00000000e+00,   1.44372410e+05,
          1.18671850e+05,   3.83199620e+05],
       [  1.00000000e+00,   0.00000000e+00,   1.42107340e+05,
          9.13917700e+04,   3.66168420e+05]])

In [7]:
X.shape

(50, 5)

In [8]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)



In [9]:
"""
We don't need feature scaling, python modul takes care of this automatically
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

"""

# Fitting regression model
# itt SVR model van
regressor = LinearRegression()
#regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train) # fit the regressor to the training set
                                 # basically "machine learning"

# Predicting the test results
y_pred = regressor.predict(X_test) # ezeket prediktálja, majd ezeket lehet összehasonlítani a y_test-el, 
                                #hogy megmondja mennyire jó a modellünk

In [10]:
# We evaluate our model: https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606 
print("Mean absolute error (MAE): {}".format(metrics.mean_absolute_error(y_test, y_pred)))
print("Mean squared error (MSE): {}".format(metrics.mean_squared_error(y_test, y_pred)))
print("Root mean squared error (RMSE): {}".format(np.sqrt(metrics.mean_absolute_error(y_test, y_pred))))

Mean absolute error (MAE): 6520.697183080078
Mean squared error (MSE): 61903144.4023628
Root mean squared error (RMSE): 80.75083394665394


In [11]:
y_pred

array([ 104282.76472172,  132536.88499212,  133910.85007767,
         72584.77489417,  179920.92761891,  114549.31079233,
         66444.43261346,   98404.96840122,  114499.82808602,
        169367.50639896,   96522.62539981,   88040.6718287 ,
        110949.99405526,   90419.1897851 ,  128020.46250064])

In [12]:
# test your regressor with some total new data 
new_vector = np.array([0, 0, 16000, 130000, 500000]).reshape((1, 5))

In [13]:
new_vector

array([[     0,      0,  16000, 130000, 500000]])

In [14]:
y_pred = regressor.predict(new_vector)
y_pred

array([ 74490.32941474])