# Feature Engineering
> 1. Detect and drop the low-variance features
<br>
> 2. Drop columns that are highly correlated


In [2]:
import pandas as pd
import glob
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Gather every test-split CSV under BlogFeedback/ and stack them row-wise
# into a single frame; the files carry no header row.
filename_test = glob.glob('BlogFeedback' + '/*test*.csv')
df_test = pd.concat([pd.read_csv(path, header=None) for path in filename_test])
# The training split lives in one headerless file.
df_train = pd.read_csv('BlogFeedback/blogData_train.csv',header=None)

In [4]:
# Label the 281 headerless columns "1" .. "281" (1-based, stored as strings)
# identically in both splits.
df_train.columns = list(map(str, range(1, 282)))
df_test.columns = list(map(str, range(1, 282)))

## Detect features with low variance

Features with zero variance (i.e. constant columns, the default `VarianceThreshold()` criterion) are dropped from the dataset in this step. They are:
<br>
> Feature 13: min value of the comments received in mid 24h time interval of the source blog
<br>
> Feature 33: min value of the trackbacks received in last 24h before the basetime of the source blog
<br>
> Feature 38: min value of the trackbacks received in mid 24h time interval of the source blog
<br>
> Feature 278: min value of the comments parent page received

In [35]:
# With its default threshold of 0.0, VarianceThreshold flags only columns
# whose value is identical in every training row.
selector = VarianceThreshold()
# Only the fitted support mask is needed, so fit() is enough; fit_transform()
# would also build a transformed copy of the whole matrix just to discard it.
selector.fit(df_train)
# Show the labels of the columns the selector rejects.
df_train.columns[~selector.get_support()]

Index(['13', '33', '38', '278'], dtype='object')

In [5]:
# Keep only the training columns the fitted selector marks as supported.
df_train = df_train.loc[:, selector.get_support()]

In [6]:
# Apply the mask learned on the training data so the test split ends up with the same columns.
df_test = df_test.loc[:, selector.get_support()]

In [7]:
# Print index range, column count, dtypes and memory usage of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52397 entries, 0 to 52396
Columns: 277 entries, 1 to 281
dtypes: float64(277)
memory usage: 110.7 MB


In [8]:
# Print index range, column count, dtypes and memory usage of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7624 entries, 0 to 94
Columns: 277 entries, 1 to 281
dtypes: float64(277)
memory usage: 16.2 MB


## Drop columns that are highly correlated 

In [10]:
# Pairwise correlation between all training columns, as absolute values so
# that strong negative correlation counts the same as strong positive.
corr_matrix = df_train.corr().abs()

In [11]:
# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# column pair appears once; every other cell becomes NaN.
# The builtin `bool` replaces the `np.bool` alias, which NumPy deprecated in
# 1.20 and removed in 1.24 (it raises AttributeError there).
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

In [12]:
# Collect, in column order, every label whose column of `upper` holds at
# least one value above 0.95 (NaN cells compare as False).
to_drop = upper.columns[(upper > 0.95).any(axis=0)].tolist()

In [49]:
# Remove the same highly correlated columns from both splits.
df_train = df_train.drop(columns=to_drop)
df_test = df_test.drop(columns=to_drop)

In [50]:
# Print the training frame summary again to see the reduced column count.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52397 entries, 0 to 52396
Columns: 227 entries, 1 to 281
dtypes: float64(227)
memory usage: 90.7 MB


## Save CSV files

In [14]:
# Write both cleaned splits to disk without the row index; header=True
# emits each frame's current column labels as the header row.
for frame, out_path in (
    (df_train, 'blog_feedback_train.csv'),
    (df_test, 'blog_feedback_test.csv'),
):
    frame.to_csv(out_path, header=True, index=False)