In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sb 

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from xgboost import XGBClassifier 
from sklearn import metrics 

import warnings 
warnings.filterwarnings('ignore')


In [None]:
# Load the Tesla price history from the CSV export and preview the first rows.
csv_path = 'TESLA (2).csv'
df = pd.read_csv(csv_path)
df.head()


In [None]:
#From the first five rows, we can see that data for some of the dates is missing. The reason is that the stock market remains closed on weekends and holidays, so no trading happens on those days.

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()


In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Line chart of the closing price against the row index.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df['Close'])
ax.set_title('Tesla Close price.', fontsize=15)
ax.set_ylabel('Price in dollars.')
plt.show()


In [None]:
#The prices of tesla stocks are showing an upward and downward trend as depicted by the plot of the closing price of the stocks.

In [None]:
# Show the first rows again to compare the 'Close' and 'Adj Close' columns side by side.
df.head()

In [None]:
#If we observe carefully, we can see that the data in the ‘Close’ column and the data in the ‘Adj Close’ column are the same. Let’s check whether this is the case for every row.

In [None]:
# Shape of the subset of rows where 'Close' equals 'Adj Close'; compare the row count with df.shape.
same_as_adj_close = df['Close'] == df['Adj Close']
df.loc[same_as_adj_close].shape

In [None]:
#From here we can conclude that all the rows of the ‘Close’ and ‘Adj Close’ columns have the same data. Redundant data in the dataset is not going to help, so we’ll drop this column before further analysis.

In [None]:
# Remove the 'Adj Close' column from the working frame.
df = df.drop(columns=['Adj Close'])

In [None]:
#Now let’s draw the distribution plot for the continuous features given in the dataset.

#Before moving further let’s check for the null values if any are present in the data frame.

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
#This implies that there are no null values in the data set provided.

In [None]:
# Distribution (histogram with a KDE overlay) of each continuous column.
features = ['Open', 'High', 'Low', 'Close', 'Volume']

# Give the 2x3 grid an explicit canvas; the default figure size squeezes
# five panels into a space meant for one.
plt.figure(figsize=(20, 10))
for i, col in enumerate(features):
    plt.subplot(2, 3, i + 1)
    # sb.distplot is deprecated; histplot with kde=True is its replacement, and
    # stat='density' keeps the y-axis on a density scale like distplot did.
    sb.histplot(df[col], kde=True, stat='density')
plt.show()

In [None]:
#In the distribution plots of the OHLC data, we can see two peaks, which means the data has varied significantly in two regions. The Volume data is right-skewed (its long tail extends toward high volumes).



In [None]:
# Box plot of each continuous column to look for outliers.
# The column list is spelled out here so this cell does not depend on the
# global `features`, which is reassigned to other data later in the notebook.
boxplot_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

# plt.figure (rather than plt.subplots) sizes the canvas without creating an
# extra full-figure Axes that would sit unused behind the 2x3 grid.
plt.figure(figsize=(20, 10))
for i, col in enumerate(boxplot_columns):
    plt.subplot(2, 3, i + 1)
    sb.boxplot(df[col])
plt.show()

In [None]:
#From the above boxplots, we can conclude that only volume data contains outliers in it but the data in the rest of the columns are free from any outlier.

In [None]:
#Feature Engineering helps to derive some valuable features from the existing ones. These extra features sometimes help in increasing the performance of the model significantly and certainly help to gain deeper insights into the data.

In [None]:
# Break the 'Date' strings apart on '/'; the assignments below treat the
# pieces as month (0), day (1) and year (2).
splitted = df['Date'].str.split('/', expand=True)

# Columns are added in the order day, month, year.
for column, position in (('day', 1), ('month', 0), ('year', 2)):
    df[column] = splitted[position].astype('int')

df.head()


In [None]:
# Flag rows whose month is a multiple of 3 (March, June, September, December) with 1, all others with 0.
df['is_quarter_end'] = (df['month'] % 3 == 0).astype(int)
df.head()


In [None]:
# Mean of every remaining column per calendar year ('Date' is dropped first
# because it is a string column and cannot be averaged).
data_grouped = df.drop('Date', axis=1).groupby('year').mean()

# plt.figure (rather than plt.subplots) sizes the canvas without creating an
# extra full-figure Axes that would sit unused behind the 2x2 grid.
plt.figure(figsize=(20, 10))

for i, col in enumerate(['Open', 'High', 'Low', 'Close']):
    plt.subplot(2, 2, i + 1)
    # Title each panel with its column name so the four charts can be told apart.
    data_grouped[col].plot.bar(title=col)
plt.show()

In [None]:
# Column means for quarter-end months (1) versus all other months (0).
df.drop(columns='Date').groupby('is_quarter_end').mean()

In [None]:
#A quarter is defined as a group of three months. Every company prepares its quarterly results and publishes them publicly so that people can analyze the company’s performance. These quarterly results affect the stock prices heavily, which is why we have added this feature: it can be a helpful feature for the learning model.

In [None]:
#Here are some of the important observations of the above-grouped data:

#Prices are higher in the months which are quarter end as compared to that of the non-quarter end months.
#The volume of trades is lower in the months which are quarter end.

In [None]:
# Intraday spreads used as model inputs.
df['open-close'] = df['Open'].sub(df['Close'])
df['low-high'] = df['Low'].sub(df['High'])

# target is 1 when the following row's close is higher than this row's close, else 0.
# The last row has no following row (shift(-1) yields NaN), so the comparison
# is False there and its target is 0.
next_close = df['Close'].shift(-1)
df['target'] = (next_close > df['Close']).astype(int)


In [None]:
# Share of each target class.
# value_counts() orders classes by frequency, not by value, so sort by the
# class label and take the labels from the index; hard-coding [0, 1] would
# swap the slice labels whenever class 1 is the more frequent one.
class_counts = df['target'].value_counts().sort_index()
plt.pie(class_counts.values,
        labels=class_counts.index, autopct='%1.1f%%')
plt.show()


In [None]:
#When we add features to our dataset we have to ensure that there are no highly correlated features as they do not help in the learning process of the algorithm.

In [None]:
# Keep every numeric column. Selecting only 'float64'/'int64' would silently
# drop the integer columns on platforms whose default integer is 32-bit.
df_numeric = df.select_dtypes(include='number')

# Calculate the correlation matrix on the numerical columns
corr_matrix = df_numeric.corr()

# Plot the heatmap: a cell is flagged when the correlation is strong in
# either direction, so highly negative pairs (below -0.9) are not missed.
plt.figure(figsize=(5, 5))
sb.heatmap(corr_matrix.abs() > 0.9, annot=True, cbar=False)
plt.show()

In [None]:
#From the above heatmap, we can say that there is a high correlation between the OHLC columns, which is pretty obvious. The added features are not highly correlated with each other or with the previously provided features, which means that we are good to go and build our model.

In [None]:
#DATA SPLITTING AND NORMALIZATION
# 'target' compares each row's close with the next row's close, so the final
# row has nothing to compare against and its 0 is not a real label. Leave it out.
labelled = df.iloc[:-1]
features = labelled[['open-close', 'low-high', 'is_quarter_end']]
target = labelled['target']

# Split first (90/10, fixed seed) ...
X_train, X_valid, Y_train, Y_valid = train_test_split(
    features, target, test_size=0.1, random_state=2022)

# ... then fit the scaler on the training rows only and reuse its statistics
# for the validation rows. Fitting on all rows before the split would leak
# validation-set mean/variance into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
print(X_train.shape, X_valid.shape)


In [None]:
#After selecting the features to train the model on, we normalize the data because normalized data leads to stable and fast training of the model. The whole data is also split into two parts with a 90/10 ratio so that we can evaluate the performance of our model on unseen data.

In [None]:
# Candidate classifiers. SVC needs probability=True to expose predict_proba;
# its probability calibration shuffles data internally, so it is seeded to
# make the reported scores reproducible between runs.
models = [
    LogisticRegression(),
    SVC(kernel='poly', probability=True, random_state=2022),
    XGBClassifier(),
]

for model in models:
    model.fit(X_train, Y_train)

    # Probability of class 1 for every row; roc_auc_score ranks by this score.
    train_scores = model.predict_proba(X_train)[:, 1]
    valid_scores = model.predict_proba(X_valid)[:, 1]

    print(f'{model} : ')
    # The metric is ROC AUC, not accuracy, so it is labelled as such.
    print('Training ROC AUC : ', metrics.roc_auc_score(Y_train, train_scores))
    print('Validation ROC AUC : ', metrics.roc_auc_score(Y_valid, valid_scores))
    print()

In [None]:
#Among the three models we have trained, XGBClassifier has the highest performance, but it is prone to overfitting, as the difference between its training and validation scores is too high. In the case of Logistic Regression, this is not the case.



In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion matrix of the second fitted model (models[1]) on the validation split.
clf = models[1]

# Hard class predictions on the validation rows; not consumed by the plot
# below, which is given the estimator and the data directly.
y_pred = clf.predict(X_valid)

# Plot confusion matrix
ConfusionMatrixDisplay.from_estimator(clf, X_valid, Y_valid)
plt.show()