In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [18]:
usecols = ["pclass", "sibsp", "parch", "sex", "embarked", "cabin", "survived"]
data = pd.read_csv('titanic.csv')
data['Cabin'].head(2)

0    NaN
1    C85
Name: Cabin, dtype: object

In [19]:
data['Cabin'] = data['Cabin'].str[0]
data['Cabin'].head(2)

0    NaN
1      C
Name: Cabin, dtype: object

In [20]:
# One-hot-encoding: importance b/c methods learn pattern from train data,
# and avoid leaking information and overfitting.

# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop("Survived", axis=1),  # predictors
    data["Survived"],  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0,  # seed to ensure reproducibility
)

X_train.shape, X_test.shape

((623, 11), (268, 11))

In [22]:
X_train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [23]:


# embarked has 3 labels and missing data

X_train["Embarked"].unique()



array(['S', 'C', 'Q', nan], dtype=object)

#### One hot encoding with pandas
* quick
* returns pandas dataframe
* returns feature names for the dummy variables


In [24]:
# Into k dummy varible
tmp = pd.get_dummies(X_train['Sex'])
tmp.head()

Unnamed: 0,female,male
857,False,True
52,True,False
386,False,True
124,False,True
578,True,False


In [25]:
# for better visualisation let's put the dummies next
# to the original variable

pd.concat([X_train["Sex"], pd.get_dummies(X_train["Sex"])], axis=1).head()


Unnamed: 0,Sex,female,male
857,male,False,True
52,female,True,False
386,male,False,True
124,male,False,True
578,female,True,False


In [26]:
# and now let's repeat for embarked

tmp = pd.get_dummies(X_train["Embarked"])

tmp.head()


Unnamed: 0,C,Q,S
857,False,False,True
52,True,False,False
386,False,False,True
124,False,False,True
578,True,False,False


In [27]:
X_train_enc = pd.get_dummies(X_train)

print(X_train_enc.shape)

X_train_enc.head()

(623, 1152)


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,"Name_Abbing, Mr. Anthony","Name_Abbott, Mr. Rossmore Edward","Name_Abelson, Mrs. Samuel (Hannah Wizosky)","Name_Adahl, Mr. Mauritz Nils Martin",...,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
857,858,1,51.0,0,0,26.55,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
52,53,1,49.0,1,0,76.7292,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False
386,387,3,1.0,5,2,46.9,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
124,125,1,54.0,0,1,77.2875,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True
578,579,3,,1,0,14.4583,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [29]:


# obtaining k-1 labels: we need to indicate get_dummies
# to drop the first binary variable

tmp = pd.get_dummies(X_train["Sex"], drop_first=True)

tmp.head()



Unnamed: 0,male
857,True
52,False
386,True
124,True
578,False


In [35]:


# load dataset

data = pd.read_csv('housingprice.csv',usecols=["Neighborhood", "Exterior1st", "Exterior2nd", "SalePrice"],
)

data.head()



Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd,SalePrice
0,CollgCr,VinylSd,VinylSd,208500
1,Veenker,MetalSd,MetalSd,181500
2,CollgCr,VinylSd,VinylSd,223500
3,Crawfor,Wd Sdng,Wd Shng,140000
4,NoRidge,VinylSd,VinylSd,250000


In [36]:
for col in data.columns:
    print(col,':',len(data[col].unique()),'labels')

Neighborhood : 25 labels
Exterior1st : 15 labels
Exterior2nd : 16 labels
SalePrice : 663 labels


In [43]:
X_train, X_test, y_train, y_test = train_test_split(
    data[["Neighborhood", "Exterior1st", "Exterior2nd"]],  # predictors
    data["SalePrice"],  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0,
)  # seed to ensure reproducibility

X_train.shape, X_test.shape

((1022, 3), (438, 3))

##### Count and Frequency encoding with pandas

In [44]:
count_map = X_train['Neighborhood'].value_counts().to_dict()
count_map

{'NAmes': 151,
 'CollgCr': 105,
 'OldTown': 73,
 'Edwards': 71,
 'Sawyer': 61,
 'Somerst': 56,
 'Gilbert': 55,
 'NWAmes': 51,
 'NridgHt': 51,
 'SawyerW': 45,
 'BrkSide': 41,
 'Mitchel': 36,
 'Crawfor': 35,
 'Timber': 30,
 'NoRidge': 30,
 'ClearCr': 24,
 'IDOTRR': 24,
 'SWISU': 18,
 'StoneBr': 16,
 'Blmngtn': 12,
 'MeadowV': 12,
 'BrDale': 10,
 'NPkVill': 7,
 'Veenker': 6,
 'Blueste': 2}

In [45]:
# replace labels with count
X_train['Neighborhood'] = X_train['Neighborhood'].map(count_map)
X_test['Neighborhood'] = X_test['Neighborhood'].map(count_map)


In [46]:
# let's explore the result

X_train["Neighborhood"].head(10)

64      105
682      24
960      41
1384     71
1100     18
416      61
1034     35
853     151
472      71
1011     71
Name: Neighborhood, dtype: int64

In [47]:
# if instead of the count we would like the frequency
# we need only divide the count by the total number of observations:

frequency_map = (X_train["Exterior1st"].value_counts(normalize=True)).to_dict()
frequency_map

{'VinylSd': 0.3561643835616438,
 'HdBoard': 0.149706457925636,
 'Wd Sdng': 0.14481409001956946,
 'MetalSd': 0.1350293542074364,
 'Plywood': 0.08414872798434442,
 'CemntBd': 0.03816046966731898,
 'BrkFace': 0.03424657534246575,
 'WdShing': 0.02054794520547945,
 'Stucco': 0.016634050880626222,
 'AsbShng': 0.014677103718199608,
 'Stone': 0.0019569471624266144,
 'AsphShn': 0.0009784735812133072,
 'BrkComm': 0.0009784735812133072,
 'ImStucc': 0.0009784735812133072,
 'CBlock': 0.0009784735812133072}

In [49]:
#replace the labels with the frequencies

X_train["Exterior1st"] = X_train["Exterior1st"].map(frequency_map)
X_test["Exterior1st"] = X_test["Exterior1st"].map(frequency_map)
