In [1]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import preprocessing
import sklearn.model_selection as ms
import sklearn.metrics as sklm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy.random as nr
import seaborn as sns
from datetime import date
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.covariance import ShrunkCovariance, LedoitWolf
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import scipy.stats as ss
from scipy.stats import skew
import math
from scipy.stats import norm
import warnings
warnings.filterwarnings('ignore')

KeyboardInterrupt: 

In [None]:
# Load the training and test sets from CSV files in the working directory.
TrainData, TestData = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))


In [None]:
# Trying a different methodology built around factor analysis.
# Factor analysis is itself a dimensionality reduction, so no features are pruned by hand beforehand.
# The target is kept as a Series (single brackets); the one-column DataFrame variant is left disabled.
#TrainTarget = TrainData[['SalePrice']]
TrainTarget = TrainData['SalePrice']


In [None]:
# Remove the target column from the features; row order is untouched, so TrainTarget stays aligned.
TrainData = TrainData.drop(columns=['SalePrice'])


In [None]:
# Preview the first five training rows.
TrainData.head()

In [None]:
# Stack train on top of test so the dummy encoding below sees every category
# level that occurs in either set.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# original row labels are kept (ignore_index is left at its default), exactly
# as append did.
TrainTest = pd.concat([TrainData, TestData])
TrainTest.to_csv('TrainTestCheck1.csv')

In [None]:
# Missing-value count per object-dtype column, showing only columns that have any.
TrainTest.select_dtypes(include='object').isnull().sum().loc[lambda counts: counts > 0]

In [None]:
# Preview the first five rows of the combined frame.
TrainTest.head()

In [None]:
# Categorical columns whose missing entries are replaced by the literal category 'None'.
none_filled_columns = [
    'Alley', 'Utilities', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
]
for col in none_filled_columns:
    TrainTest[col] = TrainTest[col].fillna('None')

In [None]:
# Re-check which object-dtype columns still contain missing values after the 'None' fill.
TrainTest.select_dtypes(include='object').isnull().sum().loc[lambda counts: counts > 0]

In [None]:
# Fill the remaining categorical gaps with each column's most frequent value.
mode_filled_columns = ['MSZoning', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'SaleType', 'Functional']
for col in mode_filled_columns:
    most_common = TrainTest[col].mode()[0]
    TrainTest[col] = TrainTest[col].fillna(most_common)

In [None]:
# Missing-value count per int/float column, showing only columns that have any.
TrainTest.select_dtypes(include=['int','float']).isnull().sum().loc[lambda counts: counts > 0]

In [None]:
# Numeric columns whose missing entries are filled with 0.
zero_filled_columns = [
    'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea', 'GarageYrBlt',
]
for col in zero_filled_columns:
    TrainTest[col] = TrainTest[col].fillna(0)

In [None]:
# Harmonize the coding of the 'Quality' metrics: OverallQual is numeric, while
# these columns are categorical. Ordinal scores on a 9-point scale seem most
# appropriate.
#
# Each column is rewritten with one whole-column assignment and the integer cast
# is assigned back. The previous chained-indexing form (frame[col][mask] = value)
# is unreliable, and its astype() result was discarded, which left the columns as
# object dtype so that get_dummies later one-hot encoded them instead of keeping
# the ordinal scores.
quality_codes = {'Ex': 9, 'Gd': 7, 'TA': 5, 'Fa': 3, 'Po': 1, 'NA': 0, 'None': 0}
for col in ('ExterQual','HeatingQC','FireplaceQu','GarageQual','KitchenQual','BsmtQual','BsmtCond','GarageCond','PoolQC'):
    # replace() leaves values outside the mapping untouched, so re-running this
    # cell on already-coded columns is harmless; astype raises if an unexpected
    # label or a missing value is still present.
    TrainTest[col] = TrainTest[col].replace(quality_codes).astype('int64')

In [None]:
# Checkpoint: snapshot the combined frame to CSV before one-hot encoding.
TrainTest.to_csv('TrainTestCheck2.csv')

In [None]:
# One-hot encode the categorical columns; no separate indicator column is created for missing values.
TrainTest = pd.get_dummies(data=TrainTest, dummy_na=False)

In [None]:
# Checkpoint: snapshot the combined frame to CSV after one-hot encoding.
TrainTest.to_csv('TrainTestCheck3.csv')

In [None]:
# Re-check for leftover missing values after one-hot encoding. Only the numeric
# summary (the cell's final expression) is displayed by the notebook.
TrainTest.select_dtypes(include='object').isnull().sum().loc[lambda counts: counts > 0]
TrainTest.select_dtypes(include=['int','float']).isnull().sum().loc[lambda counts: counts > 0]

In [None]:
# PCA complained that the array must not contain infs or NaNs -- where do they come from?
# Drop every column that is 0 in all rows. These looked like leftovers of
# get_dummies with dummy_na=True; with dummy_na=False this step may be unnecessary.
has_nonzero_value = TrainTest.ne(0).any(axis=0)
TrainTest = TrainTest.loc[:, has_nonzero_value]


In [None]:
# Earlier runs failed with "array must not contain infs or NaNs" and then
# "LinAlgError: SVD did not converge". Turn +/-inf into NaN, then drop every
# row that contains a NaN.
without_infs = TrainTest.replace([np.inf, -np.inf], np.nan)
TrainTest = without_infs.dropna()

TrainTest.to_csv('TrainTestCheck4.csv')

In [None]:
# True if any cell in the frame is still missing.
TrainTest.isna().values.any()

In [None]:
# A problem with running factor analysis:
# standard methods of performing factor analysis (i.e., those based on a matrix of Pearson's correlations)
# assume that the variables are continuous and follow a multivariate normal distribution.

# What do I do, then?
# Using tetrachoric correlations as the basis for the factor analysis appears to be the path forward,
# but I'll need to figure out which Python module supports this.
# I may end up trying PCA instead.



In [None]:
# Factor analysis and PCA require scaled inputs.
# The scaler is fitted on the raw value array, so the rebuilt frame has default
# integer row and column labels.
sc = RobustScaler()
TrainTest = pd.DataFrame(sc.fit_transform(TrainTest.values))

TrainTest.to_csv('TrainTestCheck5.csv')

from sklearn.datasets import load_digits
from sklearn.decomposition import FactorAnalysis
from sklearn.model_selection import GridSearchCV
#transformer = FactorAnalysis(n_components=7, random_state=0)
#TrainTest_transformed = transformer.fit_transform(TrainTest)
#TrainTest_transformed.shape

# PCA / factor-analysis model selection taken from scikit-learn's example:
# https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_fa_model_selection.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-fa-model-selection-py
n_features = TrainTest.shape[1] - 1
n_components = np.arange(0, n_features, 5)  # candidate component counts: 0, 5, 10, ...


def shrunk_cov_score(X):
    """Return the mean 5-fold CV score of the best ShrunkCovariance model on X.

    The shrinkage value is chosen by a 5-fold grid search over 30
    log-spaced values between 1e-2 and 1.
    """
    shrinkage_grid = {'shrinkage': np.logspace(-2, 0, 30)}
    search = GridSearchCV(ShrunkCovariance(), shrinkage_grid, cv=5)
    best_model = search.fit(X).best_estimator_
    fold_scores = cross_val_score(best_model, X, cv=5)
    return np.mean(fold_scores)


def compute_scores(X):
    """Cross-validate PCA and FactorAnalysis on X for every candidate component count.

    Iterates over the module-level ``n_components`` array and, for each value,
    records the mean 5-fold ``cross_val_score`` of both estimators.

    Returns:
        (pca_scores, fa_scores): two lists of mean scores, ordered like
        ``n_components``.
    """
    pca_model = PCA(svd_solver='full')
    fa_model = FactorAnalysis()

    paired_scores = []
    for n_comp in n_components:
        # Configure both estimators before scoring either one.
        pca_model.set_params(n_components=n_comp)
        fa_model.set_params(n_components=n_comp)
        pca_mean = np.mean(cross_val_score(pca_model, X, cv=5))
        fa_mean = np.mean(cross_val_score(fa_model, X, cv=5))
        paired_scores.append((pca_mean, fa_mean))

    pca_scores = [pair[0] for pair in paired_scores]
    fa_scores = [pair[1] for pair in paired_scores]
    return pca_scores, fa_scores


def compute_scoresfa(X):
    """Cross-validate FactorAnalysis alone on X for every candidate component count.

    Returns a list with the mean 5-fold ``cross_val_score`` for each value in
    the module-level ``n_components`` array, in the same order.
    """
    fa_model = FactorAnalysis()
    return [
        np.mean(cross_val_score(fa_model.set_params(n_components=n_comp), X, cv=5))
        for n_comp in n_components
    ]

def lw_score(X):
    """Return the mean 5-fold CV score of a LedoitWolf covariance estimator on X."""
    fold_scores = cross_val_score(LedoitWolf(), X, cv=5)
    return np.mean(fold_scores)


# Score every candidate component count with both estimators, then keep the
# count whose mean cross-validated PCA score is highest.
pca_scores, fa_scores = compute_scores(TrainTest)
n_components_pca = n_components[np.argmax(pca_scores)]
# Disabled: selecting the FactorAnalysis component count the same way.
#n_components_fa = n_components[np.argmax(fa_scores)]
#fa_scores = compute_scoresfa(TrainTest)
#n_components_fa = n_components[np.argmax(fa_scores)]

# Fit a PCA whose component count is chosen by the 'mle' option. The transformed
# array is not stored; as the last expression it is only shown as cell output.
pca = PCA(svd_solver='full', n_components='mle')
pca.fit_transform(TrainTest)
#n_components_pca_mle = pca.n_components_

# Disabled: reporting of the selected component counts.
#print("best n_components by PCA CV = %d" % n_components_pca)
#print("best n_components by FactorAnalysis CV = %d" % n_components_fa)
#print("best n_components by PCA MLE = %d" % n_components_pca_mle)



In [None]:
# Since I'm having trouble with scikit-learn, take a look at the statsmodels implementation of PCA.
# Recall that with PCA, at deployment time we predict from the PCA model's components:
# the combined features it produces. As with factor analysis, they are a set of
# predictors that replace the original columns.

# Imported under an alias so the scikit-learn PCA class bound to the name `PCA`
# (used by compute_scores in the earlier cell) is not shadowed when cells are re-run.
from statsmodels.multivariate.pca import PCA as smPCA
pca_model = smPCA(TrainTest, standardize=False, demean=True)
fig = pca_model.plot_scree(log_scale=False)

In [None]:
# Save the PCA factors (one row per observation, one column per component).
PCAFac = pd.DataFrame(pca_model.factors)
PCAFac.to_csv('smPCAFac.csv')

# Plot the first ten factors against the observation index.
fig, ax = plt.subplots(figsize=(8, 4))
lines = ax.plot(PCAFac.iloc[:, :10], lw=4, alpha=.6)
# The x axis is the row (observation) index, so it is labelled as such instead of
# borrowing tick labels from the feature columns.
ax.set_xlabel('Observation')
ax.set_ylabel('Factor value')
# NOTE(review): this limits the view to the first ~50 observations; it looks
# inherited from an example plot -- confirm the zoom is intended.
ax.set_xlim(0, 51)
fig.subplots_adjust(.1, .1, .85, .9)
# One legend entry per plotted line (previously only six of the ten were labelled).
pc_labels = ['PC %d' % (position + 1) for position in range(len(lines))]
legend = fig.legend(lines, pc_labels, loc='center right')
legend.draw_frame(False)

In [None]:
# Split the combined frame back into its train and test parts. The training rows
# come first, and their count is taken from the target vector rather than being
# hard-coded, so the split follows the data that was actually loaded.
# Assumes no rows were removed from TrainTest since the two sets were stacked.
n_train_rows = len(TrainTarget)
Train = TrainTest.iloc[:n_train_rows]
Test = TrainTest.iloc[n_train_rows:]

In [None]:
# Disabled distribution plots of individual predictors, kept for reference.
#sns.distplot(TrainData['TotRmsAbvGrd']);
#sns.distplot(TrainData['1stFlrSF']);
#sns.distplot(TrainData['OverallQual']);
#sns.distplot(TrainData['FullBath']);
#sns.distplot(TrainData['YearRemodAdd']);
#sns.distplot(TrainData['YearBuilt']);
#sns.distplot(TrainData['ExterQual']);
#sns.distplot(TrainData['LotFrontage']);
# Distribution of the log-transformed target; the trailing ';' suppresses the repr output.
sns.distplot(np.log(TrainTarget));

In [None]:
# Bind the modelling target, then snapshot both halves of the split to CSV.
y = TrainTarget
for split_frame, csv_name in ((Train, 'Traincheck.csv'), (Test, 'TestCheck.csv')):
    split_frame.to_csv(csv_name)

In [None]:
# Model the log of the sale price. The log is always taken from the untouched
# TrainTarget, so re-running this cell cannot apply the transform twice
# (y = np.log(y) compounded on every re-run).
y = np.log(TrainTarget)
#Test = Test.iloc[:,1:]

In [None]:
# Poisson regression model (statsmodels.genmod.families.family.Poisson).
# Concept: the dataset is right-skewed, so a regression that models a
# right-skewed distribution is worth investigating.
# The array interface takes endog and exog directly; the stray data=Train keyword
# (which belongs to the formula interface) has been removed.
PoiReg = sm.GLM(y, Train, family=sm.families.Poisson()).fit()

#mod1 = smf.glm(formula=formula, data=dta, family=sm.families.Binomial()).fit()

In [None]:
# First five test rows, skipping the first column.
Test.iloc[:, 1:].iloc[:5]

In [None]:
# First five training rows, skipping the first column.
Train.iloc[:, 1:].iloc[:5]

In [None]:
#print PoiReg.summary()
# Predict on the test rows, then exponentiate to undo the log applied to the target.
Result = np.exp(pd.DataFrame(PoiReg.predict(Test)))

In [None]:
# Write the back-transformed predictions to CSV.
Result.to_csv('resultreg14.csv')
# Disabled draft of a submission-file export pairing each test Id with its prediction.
#Output = pd.DataFrame({'Id':TestData[['Id']], 'SalePrice':Result})
#Output.to_csv('submissiontmb.csv', index=False)