In [None]:
import matplotlib.pyplot as plt   # imported here so the cell also runs on a fresh kernel
import numpy as np

# Log-transform the price column; np.exp() is the inverse of np.log().
# NOTE(review): `ds` is not created in this cell - presumably the frame loaded with
# pd.read_csv in the data-loading cell; run that cell first.
ds['log_price'] = np.log(ds['price_usd'])

# ds2_s[:, [0]] selects one column of a 2-D NumPy array (the list index keeps it 2-D).
# NOTE(review): `ds2_s` is assumed to be the array returned by preprocessing.scale(ds2).
plt.scatter(ds2_s[:, [0]], ds2_s[:, [1]])

In [None]:
import pandas as pd

# Forward slash instead of 'workingData\cars.csv': the backslash form relied on the
# invalid escape sequence '\c' (a SyntaxWarning on recent Python versions) and only
# resolved as a path on Windows.
ds = pd.read_csv('workingData/cars.csv')
ds = ds.drop(columns="ID")                                   # reassign instead of inplace=True
ds['gender'].unique()                                        # inspect raw categories (shown only when it is the last line of a cell)
ds['gender'] = ds['gender'].map({'male': 1, 'female': 0})    # .replace can also be used
ds = ds.dropna(axis=0)                                       # drop every row that contains a missing value

# Remove outliers: keep rows strictly below the 99th percentile of odometer_value.
q = ds['odometer_value'].quantile(0.99)
ds = ds[ds['odometer_value'] < q]
ds = ds.reset_index(drop=True)                               # rebuild a clean 0..n-1 index after filtering

# Create dummy (one-hot) columns for the discrete values; drop_first drops one level per column.
# The original passed `ds1` here, which is never defined before this line.
ds1 = pd.get_dummies(ds, drop_first=True)

pd.options.display.max_rows = 999
pd.set_option('display.float_format', lambda x: '%.2f' % x)  # show floats with 2 decimals
# NOTE(review): no 'Diff' column is created in the visible cells - confirm where it comes from.
ds.sort_values('Diff', ascending=False).head(99)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb
sb.set()                                                     # apply seaborn's default theme

# Scatter plot: c=z colours the dots by z; a low alpha gives a heat-map-like density effect.
# NOTE(review): ds_x, ds_y and z are not defined in the visible cells.
plt.scatter(ds_x, ds_y, c=z, cmap='rainbow', alpha=0.2)
plt.xlabel('x')

plt.plot([6.5, 10], [6.5, 10], color='black')                # line chart
plt.xlim(6, 11)                                              # x-axis limits

# Multiple charts: subplots returns an array of Axes that is indexed with [row, col]
# (not called), and each plotting function receives its target through ax=.
fig, axes = plt.subplots(3, 3, figsize=(10, 2))
sb.histplot(ds['price_usd'], ax=axes[0, 0])                  # histogram
sb.boxplot(x=ds['price_usd'], ax=axes[0, 1])                 # box plot
plt.tight_layout()
plt.show()

# Alternative way to lay out multiple charts: unpack the Axes directly.
#f,(pt1,pt2,pt3) = plt.subplots(1,3,sharey=True, figsize = (15,3))  # multiple charts
#pt1.scatter(ds['odometer_value'],ds['log_price'])
#pt1.set_title('odo vs price')
#pt2.scatter(ds['year_produced'],ds['price_usd'])
#plt.show()

sb.displot(ds['price_usd'])                                  # distribution plot
sb.pairplot(ds)                                              # scatter-plot matrix of the columns

# Box plot of price grouped by year. `by=` and `rot=` are pandas DataFrame.boxplot
# arguments, so the pandas API is used (the original passed them to seaborn).
# NOTE(review): column names are taken from `ds` as used elsewhere in this notebook;
# the original said 'price' / 'year' - confirm the intended frame.
ds.boxplot(column='price_usd', by='year_produced', rot=90)
plt.gca().xaxis.set_major_locator(plt.MaxNLocator(integer=True))

# Correlation heat map of the numeric columns.
# NOTE(review): `df` and 'EmbarkedInt' appear to belong to a different dataset than `ds`.
sb.heatmap(df.select_dtypes(include='number').corr().sort_values('EmbarkedInt', axis=0), cmap='coolwarm')


## statsmodels

In [None]:
import statsmodels.api as sm

# Linear regression with statsmodels. add_constant adds the intercept column that
# OLS does not include on its own.
# NOTE(review): x1 and y are not defined in the visible cells.
x = sm.add_constant(x1)
results = sm.OLS(y, x).fit()
results.summary()

# Fitted line yhat = const + coef * x1. The coefficients are taken from the fit
# (results.params) instead of being copied by hand from the summary table
# (previously 53.0468 + 0.3933*x1), so the line stays correct when the data changes.
yhat = results.predict(x)
plt.plot(x1, yhat, lw=1, c='black')

In [None]:
import statsmodels.api as sm

# Logistic regression with statsmodels.
# NOTE(review): x1 and y are not defined in the visible cells.
x = sm.add_constant(x1)           # add the intercept column
reg_log = sm.Logit(y, x)          # build the model ...
result = reg_log.fit()            # ... and fit it
result.summary()
result.predict()                  # called without arguments
result.pred_table()               # confusion matrix

# Load the confusion matrix into a DataFrame with labelled prediction columns.
predicted_labels = ['Pred 0', 'Pred 1']
confusion_matrix = pd.DataFrame(result.pred_table(), columns=predicted_labels)

In [None]:
# Check whether any feature columns are multicollinear (strongly correlated with
# each other) using the variance inflation factor (VIF).
from statsmodels.stats.outliers_influence import variance_inflation_factor

variables = ds[['odometer_value', 'year_produced', 'engine_capacity']]
feature_matrix = variables.values
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(feature_matrix, col_idx)
        for col_idx in range(variables.shape[1])
    ],
    'features': variables.columns,
})
vif
# Reading the VIF (rule of thumb):
#   VIF = 1          -> no multicollinearity
#   VIF between 1-5  -> acceptable
#   VIF > 5          -> the feature is multicollinear with the others

## sklearn

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: fill missing values with the median, then standardise.
# NOTE(review): 'price' is scaled together with the features - confirm it is not the target.
numeric_features = ['sqft_living', 'bedrooms', 'price']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps transform from failing on unseen categories.
categorical_features = ['waterfront', 'view']
categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    # Always return a dense array. With the default threshold the result can be a
    # SciPy sparse matrix when there are many one-hot columns, which would break
    # the DataFrame construction below.
    sparse_threshold=0,
)

df_transformed = preprocessor.fit_transform(df)  # apply imputers and the other preprocessing steps

# Convert the transformed data back to a DataFrame.
# Output column order follows the transformer order: numeric first, then one-hot columns.
categorical_feature_names_onehot = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
feature_names = list(numeric_features) + list(categorical_feature_names_onehot)

df_transformed = pd.DataFrame(df_transformed, columns=feature_names)  # transformed columns
# Untouched columns; the index is reset so that it lines up with df_transformed's 0..n-1 index.
df_not_transformed = df.drop(columns=numeric_features + categorical_features).reset_index(drop=True)
df_final = pd.concat([df_not_transformed, df_transformed], axis=1)    # combine with the remaining columns
df_final.head(10)

In [None]:
# Impute missing values when the dataset is small; otherwise drop the rows with
# missing values and the outliers instead.
from sklearn.impute import SimpleImputer

# SimpleImputer expects 2-D input, so the single column is reshaped to (n, 1).
engine_capacity_2d = np.array(ds['engine_capacity']).reshape(-1, 1)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')   # replace NaN with the column mean
ds['engine_capacity'] = imputer.fit(engine_capacity_2d).transform(engine_capacity_2d)

In [None]:
# Split the data into train and test sets. A fixed random_state makes the random
# split repeatable, so both sets contain the same rows on every run.
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    inputs,
    targets,
    test_size=0.2,
    random_state=3,
)

In [None]:
## Feature scaling. Do it after the data is split, so the test set does not influence the fit.
## Inputs are standardised so every feature has equal impact: (value - mean) / standard deviation.
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

scale_cols = ['col1', 'col2']          # numeric columns only - the inserted dummy columns are left as they are
scaler = StandardScaler()

# Work on copies: `xtrain_scale = xtrain` would only alias the frame, so writing the
# scaled columns would overwrite xtrain itself.
xtrain_scale = xtrain.copy()
xtrain_scale[scale_cols] = scaler.fit_transform(xtrain[scale_cols])   # fit + transform (separate fit / transform also works)

# Only transform the test data, using the statistics fitted on the training data,
# and only the columns the scaler was fitted on; otherwise the ranges differ.
xtest_scale = xtest.copy()
xtest_scale[scale_cols] = scaler.transform(xtest[scale_cols])

# Target scaling (needed by models trained on scaled targets, e.g. SVR).
# NOTE(review): assumes a single target column - confirm the shape of ytrain.
scaler_y = StandardScaler()
ytrain_scale = scaler_y.fit_transform(np.asarray(ytrain).reshape(-1, 1)).ravel()

# Once a model `reg` has been fitted on the scaled targets, map its predictions back
# to the original units (inverse_transform expects 2-D input):
# y_hat = scaler_y.inverse_transform(reg.predict(xtrain_scale).reshape(-1, 1))

# Another way to scale: a one-shot function with no reusable fitted scaler.
ds2_s = preprocessing.scale(ds2)

In [None]:
# Linear regression with scikit-learn.
# Uses `xtrain_scale`, the name produced by the feature-scaling cell; the original
# referred to `xtrain_scaled`, which is never defined.
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(xtrain_scale, ytrain)

y_hat = reg.predict(xtrain_scale)         # predict with the trained model, here on the training data itself
reg.score(xtrain_scale, ytrain)           # R-squared
reg.coef_                                 # array of coefficients, one per input feature
reg.intercept_                            # the constant term (beta_0)

In [None]:
# Polynomial regression: expand the input into polynomial terms, then fit an
# ordinary linear regression on the expanded features.
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=3)
# NOTE(review): .reshape implies xtrain is a NumPy array here (a single feature
# turned into one column) - confirm, since other cells index xtrain by column name.
xtrain_poly = poly.fit_transform(xtrain.reshape(-1, 1))

Polyreg = LinearRegression().fit(xtrain_poly, ytrain)

In [None]:
# Support vector regression (SVR), trained on scaled inputs and scaled targets.
# NOTE(review): ytrain_scale is not defined in the visible cells.
from sklearn.svm import SVR

reg = SVR().fit(xtrain_scale, ytrain_scale)
y_hat = reg.predict(xtrain_scale)         # predictions on the training data, in scaled units

In [None]:
# K-means clustering
from sklearn.cluster import KMeans

RANDOM_STATE = 3   # fixed seed so the cluster assignments and the elbow chart are repeatable

# fit_predict fits the model and returns the cluster labels in one step, so a
# separate fit() beforehand is redundant.
# NOTE(review): ds3 is not defined in the visible cells - presumably the unscaled frame
# that ds2_s was derived from.
kmeans = KMeans(n_clusters=2, random_state=RANDOM_STATE)
ds3['cluster_prediction'] = kmeans.fit_predict(ds2_s)

# WCSS = within-cluster sum of squares (inertia_). Plotting it against the number of
# clusters gives the elbow chart used to pick the number of clusters.
# The comprehension leaves `kmeans` bound to the 2-cluster model above instead of the
# last model of the sweep.
cluster_counts = range(1, 20)
wcss = [
    KMeans(n_clusters=k, random_state=RANDOM_STATE).fit(ds2_s).inertia_
    for k in cluster_counts
]
plt.plot(cluster_counts, wcss)

## TensorFlow 2.10

In [None]:
import tensorflow as tf

# Save the data as named arrays in an .npz archive, then load it back.
# NOTE(review): `inputs` and `targets` are not defined in the visible cells.
np.savez('tf_introduction', tfinputs=inputs, tftargets=targets)

tfdata = np.load('tf_introduction.npz')
ip_size = 2
op_size = 1

# A single Dense layer with op_size outputs. The initializers are optional; leaving
# them out falls back to the defaults.
output_layer = tf.keras.layers.Dense(
    op_size,
    kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1),
    bias_initializer=tf.random_uniform_initializer(-0.1, 0.1),
)
model = tf.keras.Sequential([output_layer])

custom_optimizer = tf.keras.optimizers.SGD(learning_rate=0.2)   # SGD = stochastic gradient descent
model.compile(loss='mean_squared_error', optimizer=custom_optimizer)
model.fit(tfdata['tfinputs'], tfdata['tftargets'], epochs=100, verbose=0)

weights = model.layers[0].get_weights()
weights

In [None]:
## Artificial Neural Network
# Preparation steps:
#   - encode every column as numeric, e.g. gender to 0/1
#   - create dummies for the categorical columns
#   - split the data into train and test
#   - scale the features (inputs)

# Sample classification with a yes/no result.
# Uses `xtrain_scale`, the name produced by the feature-scaling cell; the original
# referred to `xtrain_scaled`, which is never defined.
feature_cols = ['odometer_value', 'year_produced', 'engine_capacity']
xtrain_sig = np.array(xtrain_scale[feature_cols])
# NOTE(review): the target is read from the feature frame - confirm that
# 'engine_fuel_diesel' is still a 0/1 dummy there (i.e. it was not standardised).
ytrain_sig = np.array(xtrain_scale[['engine_fuel_diesel']])

ann = tf.keras.models.Sequential()                               # initialise the model
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))       # 1st hidden layer: 6 neurons, rectifier activation
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))       # 2nd hidden layer: 6 neurons
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))    # output layer: sigmoid for a 1/0 output
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])    # compile the model
ann.fit(xtrain_sig, ytrain_sig, batch_size=32, epochs=100)

# Predict on the same three feature columns the network was trained on; passing the
# whole feature frame would not match the model's input width.
y_hat = ann.predict(xtrain_sig)
ann.layers[0].get_weights()

In [None]:
# Convolutional Neural Network

# Scale the data by dividing by 255.
# NOTE(review): train_images and train_indexs are not defined in the visible cells.
train_images = train_images / 255.0

# Build the model: two convolution + pooling stages, then a dense classifier head.
cnn_layers = [
    # Convolution = feature detection with a 3x3 kernel/filter; only the first layer needs input_shape.
    tf.keras.layers.Conv2D(filters=32, kernel_size=3, strides=2, activation='relu', input_shape=(150,150,3)),
    tf.keras.layers.MaxPooling2D(pool_size=2, strides=2),      # max pooling (mean and sum pooling also exist)
    tf.keras.layers.Conv2D(filters=32, kernel_size=3, strides=2, activation='relu'),   # rectifier activation
    tf.keras.layers.MaxPooling2D(pool_size=2, strides=2),      # pooling
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=128, activation=tf.nn.relu),
    tf.keras.layers.Dense(units=2, activation=tf.nn.softmax),
]
model = tf.keras.models.Sequential()
for cnn_layer in cnn_layers:
    model.add(cnn_layer)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(train_images, train_indexs, batch_size=100, epochs=10, validation_split=0.2)

predictions = model.predict(train_images)        # vector of probabilities per image
pred_labels = np.argmax(predictions, axis=1)     # index of the highest probability