# DATA PREPROCESSING USING STOCK PRICE DATASET

# Importing Libraries

In [28]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Importing Dataset

In [29]:
df=pd.read_csv('newyork.xlsx.csv')
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,29-06-2010,19.0,25.0,17.540001,23.889999,23.889999,18766300.0
1,30-06-2010,,30.42,23.299999,23.83,23.83,17187100.0
2,01-07-2010,25.0,,20.27,21.959999,21.959999,8218800.0
3,02-07-2010,23.0,23.1,18.709999,19.200001,19.200001,5139800.0
4,06-07-2010,20.0,20.0,15.83,,16.110001,6866900.0


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2416 entries, 0 to 2415
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       2416 non-null   object 
 1   Open       2413 non-null   float64
 2   High       2412 non-null   float64
 3   Low        2413 non-null   float64
 4   Close      2412 non-null   float64
 5   Adj Close  2415 non-null   float64
 6   Volume     2414 non-null   float64
dtypes: float64(6), object(1)
memory usage: 132.2+ KB


In [31]:
df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,2413.0,2412.0,2413.0,2412.0,2415.0,2414.0
mean,186.475906,189.541186,182.916457,186.676078,186.211449,5576549.0
std,118.671695,120.233121,116.657427,119.04647,118.785452,4988098.0
min,16.139999,16.629999,14.98,15.8,15.8,118500.0
25%,34.41,34.9675,33.709999,34.417499,34.400002,1910750.0
50%,213.100006,216.760002,208.919998,213.184998,212.960007,4582150.0
75%,266.450012,270.927513,262.100006,266.865006,266.724991,7361250.0
max,673.690002,653.0,673.52002,780.0,780.0,47065000.0


In [32]:
x=df.iloc[:,:3].values   #independent variables
x

array([['29-06-2010', 19.0, 25.0],
       ['30-06-2010', nan, 30.42],
       ['01-07-2010', 25.0, nan],
       ...,
       ['30-01-2020', 632.419983, 650.880005],
       ['31-01-2020', 640.0, 653.0],
       ['03-02-2020', 673.690002, nan]], dtype=object)

In [33]:
y=df['Close'].values    #dependent variables
y

array([ 23.889999,  23.83    ,  21.959999, ..., 640.809998, 650.570007,
       780.      ])

# Handling Missing Data

In [34]:
df.isnull().sum()

Date         0
Open         3
High         4
Low          3
Close        4
Adj Close    1
Volume       2
dtype: int64

In [37]:
#imputing by using mean strategy to fill out missing values
imputer=SimpleImputer(missing_values=np.nan,strategy='mean')
imputer=imputer.fit(x[:,1:3])
x[:,1:3]=imputer.transform(x[:,1:3])
x   

array([['29-06-2010', 19.0, 25.0],
       ['30-06-2010', 186.47590558806465, 30.42],
       ['01-07-2010', 25.0, 189.54118565008292],
       ...,
       ['30-01-2020', 632.419983, 650.880005],
       ['31-01-2020', 640.0, 653.0],
       ['03-02-2020', 673.690002, 189.54118565008292]], dtype=object)

# Encoding Categorical Data

In [41]:
LE=LabelEncoder()
x[:,0]=LE.fit_transform(x[:,0])
y=LE.fit_transform(y)
print(x,"\n","\n","\n",y)

[[2246 19.0 25.0]
 [2319 186.47590558806465 30.42]
 [33 25.0 189.54118565008292]
 ...
 [2301 632.419983 650.880005]
 [2374 640.0 653.0]
 [166 673.690002 189.54118565008292]] 
 
 
 [ 110  107   71 ... 2218 2219 2220]


# Splitting Up of Dataset into TRAINING & TESTING Data

In [47]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.4,random_state=1)

In [48]:
print(x_train,"\n\n\n",x_test) 

[[1663 352.0 353.100006]
 [1906 219.770004 222.5]
 [203 347.809998 355.0]
 ...
 [292 240.490005 242.350006]
 [185 29.950001 31.5]
 [1252 255.149994 262.459991]] 


 [[756 29.309999 29.940001]
 [1446 241.389999 241.550003]
 [928 364.48999 368.76001]
 ...
 [107 207.330002 209.350006]
 [136 313.950012 316.839996]
 [552 31.6 32.009998]]


In [49]:
print(y_train,"\n\n\n",y_test) 

[2015 1171 2085 ... 1366  344 1587] 


 [ 337 1340 2154  137 1479 1991 1178  946 1028 1749 2182  502 2162  378
 1689 1578  161  535  773  399 1536 1720  424   61    3 2038 1281  456
 1746   85  339 2049 2111 1455  306 1445  976 1911 1611  522  459  892
 2193  152 2221  174  417  922  806  496 1512 1513 1267 1482 1966 1040
 1424 1008  977  368  779 1936 1889  308 1670 1022  459  682  106 1797
 1884  407  591  840 1275  975  947 2026   23 1564 1014 1335 1180  563
 1545  882  464   47 1398 1042  933 1843 1983  524   30 2132 1572 2116
  697 1962 1734 1484 1528  512 1021 1939 1897 1975  539  479 2091  531
 1167 1620  710  366 1541 1643 1131  899   44 1799  284  125  569  365
 1094 1521  448   34 1209 2144 1187 1657 1566  540 1601   55  791 1111
 1200 1496 1497   45 1658  350 1394 1568  347 1519  359 1019  555 1448
 1313  410  770 1968  572 1906 1209 1458  866 2069  646 1602  172  904
 1829 1785 1220  322  777  153  530 1250  321 2017 1367  124 1441 1304
  255  182 1996 1488  516 1143  626  

# Feature Scaling

In [56]:
sc=StandardScaler()   #rescales the values from -1 to 1
x_train[:,1:]=sc.fit_transform(x_train[:,1:])
x_test[:,1:]=sc.fit_transform(x_test[:,1:])

In [57]:
print(x_train,"\n\n\n",x_test)

[[1663 1.4130550909113442 1.3713719067844863]
 [1906 0.29394282568133395 0.28430678415005795]
 [203 1.377593525673618 1.3871867392173016]
 ...
 [292 0.46930399906197956 0.44953073679431854]
 [185 -1.312575362092772 -1.3055050756072522]
 [1252 0.5933770081698148 0.6169186564411332]] 


 [[756 -1.3369803761155625 -1.344526961461553]
 [1446 0.4427592542302612 0.4183128699237334]
 [928 1.4757936397353264 1.4780494852582065]
 ...
 [107 0.1569334975801299 0.15006733925129048]
 [136 1.0516706589572897 1.0455242366393735]
 [552 -1.317763073525825 -1.3272826293035291]]
