<a href="https://colab.research.google.com/github/taylorfogarty/launch/blob/master/regression_vs_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Regression

In [0]:
import sklearn
from sklearn.linear_model import ElasticNet, Lasso, Ridge, LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import pandas as pd 
import matplotlib
from matplotlib import pyplot
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [0]:
from google.colab import files
uploaded = files.upload()

In [0]:
data = pd.read_csv("avocado.csv")
data = data.drop(['Unnamed: 0','Date', 'region','Total Volume','Total Bags'], axis = 1)
data = pd.get_dummies(data, drop_first=True)
y = data['AveragePrice']
X = data.drop('AveragePrice', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.1)

In [250]:
lm = LinearRegression()
lm_fit = lm.fit(X_train, y_train)
lm_predict = lm_fit.predict(X_test)

#lr = LogisticRegression()
#lr_fit = lr.fit(X_train,y_train)
#lr_predict = lr_fit.predict(X_test)
## will not work because the range of y is 0 to 3.25, not 0 to 1

ridge = Ridge()
ridge_fit = ridge.fit(X_train,y_train)
r_predict = ridge_fit.predict(X_test)

lasso = Lasso(max_iter=1000)
lasso_fit = lasso.fit(X_train,y_train)
l_predict = lasso_fit.predict(X_test)

ENet = ElasticNet(l1_ratio=.01)
ENet_fit = ENet.fit(X_train,y_train)
en_predict = ENet_fit.predict(X_test)

## there are no higher order variables in this dataset, but for the sake of exemplifying the procedure,
## pretend there is a quadratic relationship between the number of 4225 avocados and the price
data1 = data.head(500)
data1['4225_sq'] = 0
for i in range(len(data1['4225'])):
  data1['4225_sq'][i] = (data1['4225'][i])**2
y2 = data1['AveragePrice']
X2 = data1.drop('AveragePrice', axis=1)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2,y2, test_size=.1)
lm_fit = lm.fit(X_train2, y_train2)
ho_predict = lm_fit.predict(X_test2)

rand = RandomForestRegressor()
rand_fit = rand.fit(X_train,y_train)
rf_predict = rand_fit.predict(X_test)



In [252]:
print('Linear Model MSE:' , mean_squared_error(y_test,lm_predict))
#mean_squared_error(y_test,lr_predict)
print('Ridge Model MSE:' , mean_squared_error(y_test,r_predict))
print('Lasso Model MSE:' , mean_squared_error(y_test,l_predict))
print('Elastic Net Model MSE:' , mean_squared_error(y_test,en_predict))
print('Higher Order Model MSE:' , mean_squared_error(y_test2,ho_predict))
print('Random Forest MSE:' , mean_squared_error(y_test,rf_predict))

Linear Model MSE: 0.09792038897928204
Ridge Model MSE: 0.09791772364668704
Lasso Model MSE: 0.14654262570004578
Elastic Net Model MSE: 0.12919487097231586
Higher Order Model MSE: 0.015512612920132457
Random Forest MSE: 0.034497292054794515


###Classification

In [0]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
import pandas as pd

In [0]:
from google.colab import files
uploaded = files.upload()

In [0]:
data = pd.read_csv("train.csv")
data = data.drop(['Name','Ticket','Cabin'], axis=1)
data = pd.get_dummies(data)
data = data.fillna(0)

y = data['Survived']
X = data.drop('Survived', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.1)

In [236]:
data.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,7.25,0,1,0,0,1
1,2,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,3,26.0,0,0,7.925,1,0,0,0,1
3,4,1,35.0,1,0,53.1,1,0,0,0,1
4,5,3,35.0,0,0,8.05,0,1,0,0,1


In [0]:
lr = LogisticRegression()
lr_fit = lr.fit(X_train,y_train)

lasso2 = LogisticRegression(penalty='l1')
lasso2_fit = lasso2.fit(X_train,y_train)

ridge2 = LogisticRegression(penalty='l2')
ridge2_fit = ridge2.fit(X_train,y_train)

rand_for = RandomForestClassifier(n_estimators = 500, random_state = 40)
rand_for_fit = rand_for.fit(X_train,y_train)

clf = MLPClassifier(activation='logistic', solver='lbfgs',learning_rate='adaptive', alpha=.0005)
clf_fit = clf.fit(X_train, y_train)

In [273]:
print('Logistic F Score:',lr_fit.score(X_test, y_test))
print('LASSO F Score:',lasso2_fit.score(X_test, y_test))
print('Ridge F Score:',ridge2_fit.score(X_test, y_test))
print('Random Forest Classifier F Score:',rand_for_fit.score(X_test, y_test))
print('Neural Net F Score:',clf_fit.score(X_test, y_test))

Logistic F Score: 0.8555555555555555
LASSO F Score: 0.8555555555555555
Ridge F Score: 0.8555555555555555
Random Forest Classifier F Score: 0.8777777777777778
Neural Net F Score: 0.7333333333333333
