In [7]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from IPython.display import Image
from patsy import dmatrices

from sklearn.preprocessing import scale 
# NOTE(review): `sklearn.grid_search` and `sklearn.cross_validation` were
# deprecated in scikit-learn 0.18 and removed in 0.20; their contents now live
# in `sklearn.model_selection`. The two imports below fail on a current
# install -- confirm the pinned scikit-learn version before re-running.
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, SGDRegressor
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression, PLSSVD
from sklearn.metrics import mean_squared_error

# Notebook-wide pandas display settings, applied in one pass: HTML frame
# reprs switched off, no cap on displayed columns or sequence items, and at
# most 150 rows per printed frame.
for option_name, option_value in (
    ('display.notebook_repr_html', False),
    ('display.max_columns', None),
    ('display.max_rows', 150),
    ('display.max_seq_items', None),
):
    pd.set_option(option_name, option_value)

%matplotlib inline

# seaborn is imported here, after the pandas display options, rather than
# alongside the other imports at the top of the cell.
import seaborn as sns
# Select seaborn's 'notebook' plotting context and 'darkgrid' style for the
# figures drawn later in the notebook.
sns.set_context('notebook')
sns.set_style('darkgrid')

In [8]:
# Survey extract to analyse; the path is relative to the notebook's working
# directory. In the recorded run this loads 1063 rows x 54 columns, including
# an 'Unnamed: 0' column and an all-null 'rage' column.
DATA_PATH = 'Mydata.csv'
df = pd.read_csv(DATA_PATH)

In [12]:
# Summarise the freshly loaded frame: column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1063 entries, 0 to 1062
Data columns (total 54 columns):
Unnamed: 0    1063 non-null int64
country       1063 non-null object
househld      1063 non-null int64
rsex          1063 non-null object
rage          0 non-null float64
marstat6      1063 non-null object
childhh       1063 non-null object
married       1063 non-null object
readpap       1063 non-null object
whpaper       383 non-null object
webnews       1063 non-null object
wnwsite1      542 non-null object
supparty      1059 non-null object
closepty      688 non-null object
partyidn      976 non-null object
partyid1      976 non-null object
politics      1063 non-null object
soctrust      1052 non-null object
ecpolicy      974 non-null object
govtrust      1057 non-null object
mpstrust      1058 non-null object
conlabdf      1020 non-null object
spend1        1058 non-null object
dole          1029 non-null object
taxspend      1050 non-null object
incomgap      1035 non-null 

In [17]:
# Preview the first ten rows of the loaded data.
df.head(10)

   Unnamed: 0   country  househld    rsex  rage  \
0           1   England         5    Male   NaN   
1           2  Scotland         1    Male   NaN   
2           3   England         5  Female   NaN   
3           4     Wales         1  Female   NaN   
4           5   England         3  Female   NaN   
5           6     Wales         7  Female   NaN   
6           7   England         4    Male   NaN   
7           8   England         2    Male   NaN   
8           9     Wales         1    Male   NaN   
9          10   England         3  Female   NaN   

                                            marstat6 childhh  \
0                                            Married     Yes   
1                                            Married      No   
2                                            Married     Yes   
3               Divorced/dissolved civil partnership      No   
4                                            Married      No   
5                              Living with a partner  

In [22]:
# Plain Python list of every column label in the frame, in frame order.
columns_list = df.columns.tolist()

In [23]:
# Display the collected column names.
columns_list

['Unnamed: 0',
 'country',
 'househld',
 'rsex',
 'rage',
 'marstat6',
 'childhh',
 'married',
 'readpap',
 'whpaper',
 'webnews',
 'wnwsite1',
 'supparty',
 'closepty',
 'partyidn',
 'partyid1',
 'politics',
 'soctrust',
 'ecpolicy',
 'govtrust',
 'mpstrust',
 'conlabdf',
 'spend1',
 'dole',
 'taxspend',
 'incomgap',
 'ub1poor',
 'ccbeliev',
 'letin',
 'miecono',
 'micultur',
 'reasmig',
 'asastay',
 'friendob',
 'raceori3',
 'religion',
 'hedqual',
 'rearnd',
 'welffeet',
 'forgrel1',
 'forgrel2',
 'forgrel4',
 'forgrel5',
 'mnntpowr',
 'unionsa',
 'reconact',
 'natbrit',
 'nateng',
 'nateuro',
 'natscot',
 'natwelsh',
 'natni',
 'natasia',
 'natafric']

In [24]:
# Re-type every column listed in columns_list as a pandas categorical. Each
# column is converted and assigned back individually so the assignment
# replaces the column (and its dtype) outright. This includes 'Unnamed: 0'
# and 'househld', which a later cell casts back to int.
for col in columns_list:
    categorical_values = df[col].astype('category')
    df[col] = categorical_values

In [26]:
# Re-check dtypes after the category conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1063 entries, 0 to 1062
Data columns (total 54 columns):
Unnamed: 0    1063 non-null category
country       1063 non-null category
househld      1063 non-null category
rsex          1063 non-null category
rage          0 non-null category
marstat6      1063 non-null category
childhh       1063 non-null category
married       1063 non-null category
readpap       1063 non-null category
whpaper       383 non-null category
webnews       1063 non-null category
wnwsite1      542 non-null category
supparty      1059 non-null category
closepty      688 non-null category
partyidn      976 non-null category
partyid1      976 non-null category
politics      1063 non-null category
soctrust      1052 non-null category
ecpolicy      974 non-null category
govtrust      1057 non-null category
mpstrust      1058 non-null category
conlabdf      1020 non-null category
spend1        1058 non-null category
dole          1029 non-null category
taxspend      

In [29]:
# The blanket category conversion also caught these two columns; cast them
# back to the platform default integer type.
for integer_column in ('Unnamed: 0', 'househld'):
    df[integer_column] = df[integer_column].astype(int)

In [30]:
# Confirm that 'Unnamed: 0' and 'househld' are integer-typed again while the
# remaining columns stay categorical.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1063 entries, 0 to 1062
Data columns (total 54 columns):
Unnamed: 0    1063 non-null int32
country       1063 non-null category
househld      1063 non-null int32
rsex          1063 non-null category
rage          0 non-null category
marstat6      1063 non-null category
childhh       1063 non-null category
married       1063 non-null category
readpap       1063 non-null category
whpaper       383 non-null category
webnews       1063 non-null category
wnwsite1      542 non-null category
supparty      1059 non-null category
closepty      688 non-null category
partyidn      976 non-null category
partyid1      976 non-null category
politics      1063 non-null category
soctrust      1052 non-null category
ecpolicy      974 non-null category
govtrust      1057 non-null category
mpstrust      1058 non-null category
conlabdf      1020 non-null category
spend1        1058 non-null category
dole          1029 non-null category
taxspend      1050 n