In [1]:
''' Identify important content-level video features for each pipeline.
Challenges: 1. Many features are highly correlated; how can redundancy be reduced?
            2. Identify "all" important features.

High-level description of our feature selection (two steps):
            1. Remove highly correlated features.
            2. Identify important features using several kinds of feature selection methods.
'''

' Identify important content-level video features for each pipeline.\nChallenges: 1. Many features are highly correlated; how can redundancy be reduced?\n            2. Identify "all" important features.\n\nHigh-level description of our feature selection (two steps):\n            1. Remove highly correlated features.\n            2. Identify important features using several kinds of feature selection methods.\n'

In [1]:
import os
from collections import defaultdict
import glob
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel, SelectKBest, mutual_info_regression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestRegressor
from sklearn.linear_model import (LinearRegression, Ridge, 
								  Lasso)
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler
from minepy import MINE
import operator
from mpl_toolkits.mplot3d import Axes3D
sys.path.append('../')
from pipeline_performance_loader import Parser, initialization, read_feature
from sklearn.base import BaseEstimator, TransformerMixin
from VIF import ReduceVIF
import seaborn as sns
from scipy.stats import pearsonr

In [2]:
# Load pipeline performance
all_feature_names, moving, video_to_delete, selected_video, glimpse_video_to_delete = initialization()
path =  '../feature_analysis/video_features_30s/'
# feature_file = path + 'features_all_type_width_height_filter.csv'
feature_file = path + 'allvideo_features_long_add_width_20_filter.csv'
features = read_feature(feature_file)
# NOTE: this replaces the exclusion list returned by initialization() above.
video_to_delete = ['nyc', 'russia', 'tw', 'crossroad2','downtown','tw1','bridge','walking'
]
# Maps each clip key to a (bw, f1, resol) tuple read from the AWStream results.
awstream_perf = {}
keys = []
with open('../awstream/awstream_selected_video_resol_0.9_label_merge_add_width_20_filter.csv', 'r') as f:
    f.readline()  # skip the header row
    for line in f:
        stripped = line.strip()
        if not stripped:
            # A blank (e.g. trailing) line has no columns to parse.
            continue
        line_list = stripped.split(',')
        key = line_list[0]
        # The dataset name is the key without its last '_'-separated segment.
        # rsplit removes only that trailing segment; the previous
        # str.replace() removed every occurrence of it inside the key.
        dataset_name = key.rsplit('_', 1)[0]
        if dataset_name in video_to_delete:
            continue
        resol = int(line_list[2].replace('p', ''))  # strip the 'p' suffix before converting
        f1 = float(line_list[3])
        bw = float(line_list[4])
        awstream_perf[key] = (bw, f1, resol)

target_perf = awstream_perf



driving2_3,0.0,0.0,0,0.0,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,nan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017777777777777778,0,0.03079506172839507,0.17548521797688565,10.38655522428052,109.89372638584705,0.03079506172839507,0.0,0.0,0.0,0.0,0.0,2.252728336819822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,nan,0.0,0.0,0

driving2_8,0.0,0.0,0,0.0,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,nan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,nan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,nan,0.0,0.0,0

driving2_27,0.0,0.0,0,0.0,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,nan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,nan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,nan,

FileNotFoundError: [Errno 2] No such file or directory: '../awstream/awstream_selected_video_resol_0.9_label_merge_add_width_20_filter.csv'

In [None]:
# Directory holding the AWStream spatial overfitting profiles. The original
# location stays the default; set AWSTREAM_PROFILE_DIR to run on another machine.
path = os.environ.get(
    'AWSTREAM_PROFILE_DIR',
    '/Users/zhujunxiao/Desktop/benchmarking/vldb/data/awstream/overfitting_results_30s_30s_label_merge/')
# Maps each key (first CSV column) to the list of F1 values (fourth column)
# found for it across all profile files.
awstream_profile = defaultdict(list)
# glob returns files in arbitrary order; sorting makes the order of the F1
# values appended per key reproducible between runs.
profile_files = sorted(glob.glob(os.path.join(path, 'awstream_spatial_overfitting_profile*.csv')))
for file in profile_files:
    with open(file, 'r') as f:
        f.readline()  # skip the header row
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line has no columns to parse.
                continue
            line_list = stripped.split(',')
            key = line_list[0]
            f1 = float(line_list[3])
            awstream_profile[key].append(f1)


                


In [None]:
X = []
y = []
feature1 = []
feature2 = []
perf = []

feature_perf_profile = {}
for key in sorted(target_perf):
    # Clips that have a performance record but no feature vector are skipped.
    if key not in features:
        # print('feature for {} not found'.format(key))
        continue

    # Data cleaning: drop the clip as soon as one of the three checks fails.
    # The checks are evaluated in this order and short-circuit.
    clip_features = features[key]
    if (clip_features[all_feature_names.index('object_cn_avg')] <= 0
            or clip_features[all_feature_names.index('velocity_avg')] < 1
            or clip_features[all_feature_names.index('object_size_avg')] <= 0):
        continue

    # Tolerance that is only used by the disabled filter right below.
    thresh1 = 0.05
    # if np.abs(target_perf[key][1] - 0.9) > thresh1:
    #     continue

    # Skip clips whose second performance entry (index 1) is below 0.9;
    # the first entry (index 0) becomes the regression target.
    if target_perf[key][1] < 0.9:
        continue
    X.append(clip_features)
    y.append(target_perf[key][0])

    # ************ only for AWStream (disabled) ************
    # if total_object_cn[key] < 200:
    #     continue
    # if features[key][all_feature_names.index('percentage')] > 0.8:
    #     continue
    # if features[key][0] < 2:
    #     continue
    # ******************************************************
    # feature_perf_profile[key] = [target_perf[key], features[key], awstream_profile[key]]

# Disabled exploration: per-dataset scatter of object_size_avg against the
# third performance entry.
# all_dataset = ['crossroad', 'crossroad2', 'crossroad3', 'crossroad4', 'drift',
#                'driving1', 'driving2', 'highway', 'motorway', 'park', 'trip',
#                'nyc', 'russia', 'russia1', 'tw', 'tw1','jp', 'downtown',
#                'bridge', 'walking','hw', 'traffic', 'normal_traffic', 'split']
# for dataset in all_dataset:
#     feature_x = []
#     perf_y = []
#     for key in feature_perf_profile.keys():
#         if dataset in key:
#             feature_x.append(feature_perf_profile[key][1])
#             perf_y.append(feature_perf_profile[key][0])
#
#     target_feature_index = all_feature_names.index('object_size_avg')
#     plt.scatter([x[target_feature_index] for x in feature_x], [x[2] for x in perf_y])
#     plt.title(dataset)
#     plt.xlim(0, 0.04)
#     plt.ylim(0,800)
#     plt.show()
#
#     plt.annotate(key, (features[key][all_feature_names.index('object_size_avg')], awstream_profile[key][1]))
#
# plt.xscale('logit')
 

In [None]:
df = pd.DataFrame(X, columns=all_feature_names)

# Scatter the 10th-percentile object size against the target on the current
# axes, with fixed limits and a logit-scaled x axis.
ax = plt.gca()
ax.scatter(df['object_size_percentile10'], y)
ax.set_ylim(0, 1)
ax.set_xlim(0.001, 0.025)
ax.set_xscale('logit')
plt.show()

# Disabled exploration: scatter every feature whose |pearson r| with y exceeds 0.3.
# for name in all_feature_names:
#     (r, p) = pearsonr(df[name], y)
#     if abs(r) > 0.3:
#         print(name, r, p)
#         plt.scatter(df[name], y)
#         plt.title(name)
#
#         plt.show()

# Disabled exploration: histograms of a hand-picked feature subset.
# new_df = df[['percentage','percentage_w_new_object', 'velocity_avg', 'velocity_var', 'arrival_rate_avg', 'arrival_rate_var','object_cn_avg', 'object_cn_var',
#             'object_size_avg','object_size_var', 'total_area_avg','total_area_var']]
#
# hist = new_df.hist(bins=10, figsize=(12, 12))



In [None]:
# Helper that turns raw per-feature scores into a {feature name: score} dictionary.
def ranking(ranks, names, order=1):
    """Min-max scale `ranks` and pair each scaled score with its name.

    Args:
        ranks: one score per feature.
        names: feature names, in the same order as `ranks`.
        order: multiplier applied before scaling; pass -1 to flip the
            direction of the scores.

    Returns:
        dict mapping each name to its scaled score rounded to 2 decimals.
    """
    # MinMaxScaler works column-wise, so the scores are shaped as one column
    # and flattened back afterwards.
    score_column = order * np.array([ranks]).T
    scaled = MinMaxScaler().fit_transform(score_column).T[0]
    return {name: round(score, 2) for name, score in zip(names, scaled)}


def rank_to_dict(ranks, names, order=1):
    """Pair each feature name with its score, rounded to 2 decimals.

    Unlike `ranking`, the scores are not min-max scaled.

    Args:
        ranks: one score per feature.
        names: feature names, in the same order as `ranks`.
        order: multiplier applied to every score. Pass -1 when a smaller raw
            value means a better feature (e.g. RFE's `ranking_`, where 1 is
            best) so that "larger is better" holds for every method.
            Previously this argument was accepted but silently ignored.

    Returns:
        dict mapping each name to `round(order * score, 2)`.
    """
    return {name: round(order * score, 2) for name, score in zip(names, ranks)}

def topK_index(data, K):
    """Return the indices and values of the K largest entries of `data`.

    Args:
        data: 1-D numpy array.
        K: number of entries to return. Values larger than len(data) return
            everything; K <= 0 returns empty arrays.

    Returns:
        (indices, values), both ordered from largest to smallest value.
    """
    # Reverse first, then slice: the old `[-1*K:]` form returned the whole
    # array for K == 0 because `-0` is the same as `0`.
    indices = data.argsort()[::-1][:max(K, 0)]
    return indices, data[indices]

In [None]:
def feature_filtering(df, filter_method='pearson', corr_thresh=0.8, vif_thresh=5):
    """Remove redundant (highly correlated) feature columns from `df`.

    Args:
        df: DataFrame with one column per feature.
        filter_method: 'pearson' drops every column whose absolute pearson
            correlation with any earlier column exceeds `corr_thresh`;
            'VIF' delegates to `ReduceVIF(thresh=vif_thresh)`.
        corr_thresh: correlation cut-off for the 'pearson' method
            (default 0.8, the value that used to be hard-coded).
        vif_thresh: threshold handed to `ReduceVIF` for the 'VIF' method
            (default 5, the value that used to be hard-coded).

    Returns:
        The filtered DataFrame. For an unknown `filter_method` a message is
        printed and `df` is returned unchanged.
    """
    if filter_method == 'VIF':
        transformer = ReduceVIF(thresh=vif_thresh)
        return transformer.fit_transform(df)

    if filter_method == 'pearson':
        corr_matrix = df.corr()
        columns = corr_matrix.columns
        correlated_features = []
        for i in range(len(columns)):
            # Only the lower triangle is scanned: column i is compared with
            # every column that comes before it.
            for j in range(i):
                if abs(corr_matrix.iloc[i, j]) > corr_thresh:
                    correlated_features.append(columns[i])
                    break  # one hit is enough to drop column i
        return df.drop(correlated_features, axis=1)

    print('Filter method {} does not exist.'.format(filter_method))
    return df

In [None]:
# preprocessing: train test split, then standardization
print('Preprocessing starts (normalization, train_test_split)......')
# Split first and fit the scaler on the training rows only, so that the
# held-out rows do not leak into the scaling statistics. With the same
# random_state the row assignment is the same as when splitting scaled data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y,
                                    test_size=0.2, random_state=0)
scaler = preprocessing.StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for any later cell that still expects the fully scaled matrix.
X_scaled = scaler.transform(X)

# remove correlated features, using two methods
df = pd.DataFrame(X_train, columns=all_feature_names)
df_filtered_pearson = feature_filtering(df)
print('After pearson correlation filtering, remaining features:', df_filtered_pearson.columns)
df_filtered_vif = feature_filtering(df, filter_method='VIF')
print('After VIF filtering, remaining features:', df_filtered_vif.columns)

# Attach the target so that it shows up in the correlation matrices below.
# This is done after filtering, so 'perf' itself is never a filter candidate.
df['perf'] = y_train
df_filtered_pearson['perf'] = y_train
df_filtered_vif['perf'] = y_train

# visualize correlation matrix before and after filtering
heatmap_inputs = [
    (df, 'Correlation matrix before feature filtering.'),
    (df_filtered_vif, 'Correlation matrix after VIF filtering.'),
    # This title used to repeat the "before feature filtering" text.
    (df_filtered_pearson, 'Correlation matrix after pearson filtering.'),
]
for frame, title in heatmap_inputs:
    cor = frame.corr()
    sns.heatmap(cor, vmin=-1, vmax=1, center=0)
    plt.title(title)
    plt.show()


In [None]:
# Implement multiple types of feature selection methods. And compare their selected results.

def select_good_features(X, Y, names, n_features_to_select=5):
    """Score every feature with several selection methods.

    Methods: absolute coefficients of linear / ridge / lasso regression,
    recursive feature elimination (RFE) on the linear model, random-forest
    importances, the univariate F-statistic and MIC. A "Mean" entry averages
    the scores of all methods per feature.

    Args:
        X: 2-D numpy array, one column per feature.
        Y: target values, one per row of X.
        names: feature names, in the column order of X.
        n_features_to_select: number of features at which RFE stops
            eliminating. This argument used to be ignored (5 was hard-coded).

    Returns:
        (methods, ranks): `methods` is the sorted list of method names with
        "Mean" appended; `ranks[method][name]` is the score of feature `name`
        under `method`.
    """
    ranks = {}

    # `normalize=True` was dropped: the option was removed in scikit-learn 1.2
    # and the caller already standardizes the features.
    lr = LinearRegression()
    lr.fit(X, Y)
    ranks["Linear"] = rank_to_dict(np.abs(lr.coef_), names)

    ridge = Ridge(alpha=7)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

    lasso = Lasso(alpha=.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

    # Stop the search when n_features_to_select features are left (they will
    # get equal scores). In ranking_, 1 marks the kept features, hence order=-1.
    rfe = RFE(lr, n_features_to_select=n_features_to_select)
    rfe.fit(X, Y)
    ranks["RFE"] = rank_to_dict(rfe.ranking_, names, order=-1)

    # Seeded so that the importances are reproducible between runs.
    rf = RandomForestRegressor(random_state=0)
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f, names)

    mine = MINE()
    mic_scores = []
    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        mic_scores.append(mine.mic())
    ranks["MIC"] = rank_to_dict(mic_scores, names)

    # Average over the individual methods; computed before "Mean" is added so
    # that the mean does not include itself.
    mean_scores = {}
    for name in names:
        mean_scores[name] = round(np.mean([ranks[method][name]
                                           for method in ranks.keys()]), 2)

    methods = sorted(ranks.keys())
    ranks["Mean"] = mean_scores
    methods.append("Mean")

    return methods, ranks





In [None]:
# Rank the features of the unfiltered frame.
features = df.drop(['perf'], axis=1)
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same array.
X = features.values
Y = df['perf']
names = features.columns.values
print('The shape of feature matrix:', X.shape)
print('all feature names:', names)
methods, ranks = select_good_features(X, Y, names, n_features_to_select=5)


In [None]:
# Tab-separated table: one row per feature, one column per method.
header = "\t".join(methods)
print("%30s\t%s" % ('Feature name', header))
for name in names:
    row = [str(ranks[method][name]) for method in methods]
    print("%30s\t%s" % (name, "\t".join(row)))

In [None]:
# Print the three highest-scoring features of every method.
for method in methods:
    ranked = sorted(ranks[method].items(), key=lambda item: item[1], reverse=True)
    new_rank = dict(ranked)
    print('Selected features of method {}:'.format(method))
    # Slicing tolerates fewer than three features; calling next() on an
    # iterator raised StopIteration in that case.
    for selected_feature, feature_importance in ranked[:3]:
        print(selected_feature, feature_importance)


In [None]:
# Rank the features that survived the pearson-correlation filter.
features = df_filtered_pearson.drop(['perf'], axis=1)
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same array.
X = features.values
Y = df_filtered_pearson['perf']
names = features.columns.values
print('The shape of feature matrix:', X.shape)
print('all feature names:', names)
methods, ranks = select_good_features(X, Y, names, n_features_to_select=5)


In [None]:
# Tab-separated table: one row per feature, one column per method.
header = "\t".join(methods)
print("%30s\t%s" % ('Feature name', header))
for name in names:
    row = [str(ranks[method][name]) for method in methods]
    print("%30s\t%s" % (name, "\t".join(row)))

In [None]:
# Print the three highest-scoring features of every method.
for method in methods:
    ranked = sorted(ranks[method].items(), key=lambda item: item[1], reverse=True)
    new_rank = dict(ranked)
    print('Selected features of method {}:'.format(method))
    # Slicing tolerates fewer than three features; calling next() on an
    # iterator raised StopIteration in that case.
    for selected_feature, feature_importance in ranked[:3]:
        print(selected_feature, feature_importance)

In [None]:
# Rank the features that survived the VIF filter.
features = df_filtered_vif.drop(['perf'], axis=1)
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same array.
X = features.values
Y = df_filtered_vif['perf']
names = features.columns.values
print('The shape of feature matrix:', X.shape)
print('all feature names:', names)
methods, ranks = select_good_features(X, Y, names, n_features_to_select=5)


In [None]:
# Tab-separated table: one row per feature, one column per method.
header = "\t".join(methods)
print("%30s\t%s" % ('Feature name', header))
for name in names:
    row = [str(ranks[method][name]) for method in methods]
    print("%30s\t%s" % (name, "\t".join(row)))

In [None]:
# Print the three highest-scoring features of every method.
for method in methods:
    ranked = sorted(ranks[method].items(), key=lambda item: item[1], reverse=True)
    new_rank = dict(ranked)
    print('Selected features of method {}:'.format(method))
    # Slicing tolerates fewer than three features; calling next() on an
    # iterator raised StopIteration in that case.
    for selected_feature, feature_importance in ranked[:3]:
        print(selected_feature, feature_importance)

In [None]:
from scipy.stats import pearsonr

# Two-step selection on the unfiltered frame:
#   1. keep features whose |pearson r| with 'perf' exceeds correlation_thresh;
#   2. walk them from strongest to weakest and keep a feature only if it is
#      not highly correlated with any feature that was already selected.
features = df.drop(['perf'], axis=1)
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same array.
X = features.values
Y = df['perf']
names = features.columns.values
correlation_thresh = 0.3
redundancy_thresh = 0.8
correlated_features = []
for name in names:
    (r, p) = pearsonr(df[name], df['perf'])
    if np.abs(r) > correlation_thresh:
        correlated_features.append((name, r))
# Sort by strength, not by signed value: a strong negative correlation is as
# informative as a strong positive one and used to be ranked last.
correlated_features.sort(key=lambda x: abs(x[1]), reverse=True)
print(correlated_features)

# Starts empty so that an empty candidate list no longer raises IndexError.
selected_features = []
for candidate, _r in correlated_features:
    # Compare against every feature selected so far; the previous version
    # compared only against the first one, so later picks could still be
    # redundant with each other.
    is_redundant = False
    for chosen in selected_features:
        (r, p) = pearsonr(df[candidate], df[chosen])
        if np.abs(r) > redundancy_thresh:
            is_redundant = True
            break
    if not is_redundant:
        selected_features.append(candidate)
print('Final selected feature:', selected_features)