In [23]:
import numpy as np
import pandas as pd

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data.shape, test_data.shape

((3000, 23), (4398, 22))

In [27]:
from ast import literal_eval
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

In [28]:
class TextToDictTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, features):
        self.features = features
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        for column in self.features:
            X[column] = X[column].apply(lambda x: {} if pd.isna(x) else literal_eval(x))
        return X
    
class BooleanTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, features):
        self.features = features
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        try:
            for column in self.features:
                X[column] = X[column].apply(lambda x: 1 if x != {} and pd.isna(x) == False else 0)
        except Exception as ex:
            print("Boolean transformer error:", ex)
        return X
    
class OneHotTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, features, top_values):
        self.features = features
        self.top_values = top_values
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        try:
            i = 0
            for feature in self.features:
                for name in self.top_values[i]:
                    X[f'{feature}_{name}'] = X[feature].apply(lambda x: 1 if name in str(x) else 0)
                i += 1
                    
            X = X.drop(self.features, axis=1)
        except Exception as ex:
            print("One hot tansformer error:", ex)
        return X
    
class CastTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, top_cast_names, top_cast_chars):
        self.top_cast_names = top_cast_names
        self.top_cast_chars = top_cast_chars
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        try:
            X['cast_len'] = X['cast'].apply(lambda x: len(x) if x != {} else 0)
            
            for name in self.top_cast_names:
                X[f'cast_name_{name}'] = X['cast'].apply(lambda x: 1 if name in str(x) else 0)
                
            for name in self.top_cast_chars:
                X[f'cast_char_{name}'] = X['cast'].apply(lambda x: 1 if name in str(x) else 0)
            
            X['cast_gender_undef'] = X['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 0]))
            X['cast_gender_male'] = X['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 1]))
            X['cast_gender_female'] = X['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 2]))
            
            X = X.drop('cast', axis=1)
        except Exception as ex:
            print("Cast transformer error:", ex)
        return X
    
class CrewTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, top_crew_names, top_crew_jobs, top_crew_departments):
        self.top_crew_names = top_crew_names
        self.top_crew_jobs = top_crew_jobs
        self.top_crew_departments = top_crew_departments
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        try:
            X['crew_len'] = X['crew'].apply(lambda x: len(x) if x != {} else 0)
            
            for name in self.top_crew_names:
                X[f'crew_name_{name}'] = X['crew'].apply(lambda x: 1 if name in str(x) else 0)
                
            for name in self.top_crew_jobs:
                X[f'crew_job_{name}'] = X['crew'].apply(lambda x: 1 if name in str(x) else 0)
                
            for name in self.top_crew_departments:
                X[f'crew_department_{name}'] = X['crew'].apply(lambda x: 1 if name in str(x) else 0)
            
            X['crew_gender_undef'] = X['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 0]))
            X['crew_gender_male'] = X['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 1]))
            X['crew_gender_female'] = X['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 2]))
            
            X = X.drop('crew', axis=1)
        except Exception as ex:
            print("Crew transformer error:", ex)
        return X
    
class DateTransformer(BaseEstimator, TransformerMixin):        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        try:            
            X['year'] = pd.Series(pd.DatetimeIndex(X['release_date']).year)
            X['month'] = pd.Series(pd.DatetimeIndex(X['release_date']).month)
            X['day'] = pd.Series(pd.DatetimeIndex(X['release_date']).day)
            X = X.drop('release_date', axis=1)
        except Exception as ex:
            print("Date transformer pipeline error:", ex)
        return X
    
class FixRevenueTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        try:
            X.loc[X['id'] == 16,'revenue'] = 192864          # Skinning
            X.loc[X['id'] == 90,'budget'] = 30000000         # Sommersby          
            X.loc[X['id'] == 118,'budget'] = 60000000        # Wild Hogs
            X.loc[X['id'] == 149,'budget'] = 18000000        # Beethoven
            X.loc[X['id'] == 313,'revenue'] = 12000000       # The Cookout 
            X.loc[X['id'] == 451,'revenue'] = 12000000       # Chasing Liberty
            X.loc[X['id'] == 464,'budget'] = 20000000        # Parenthood
            X.loc[X['id'] == 470,'budget'] = 13000000        # The Karate Kid, Part II
            X.loc[X['id'] == 513,'budget'] = 930000          # From Prada to Nada
            X.loc[X['id'] == 797,'budget'] = 8000000         # Welcome to Dongmakgol
            X.loc[X['id'] == 819,'budget'] = 90000000        # Alvin and the Chipmunks: The Road Chip
            X.loc[X['id'] == 850,'budget'] = 90000000        # Modern Times
            X.loc[X['id'] == 1112,'budget'] = 7500000        # An Officer and a Gentleman
            X.loc[X['id'] == 1131,'budget'] = 4300000        # Smokey and the Bandit   
            X.loc[X['id'] == 1359,'budget'] = 10000000       # Stir Crazy 
            X.loc[X['id'] == 1542,'budget'] = 1              # All at Once
            X.loc[X['id'] == 1570,'budget'] = 15800000       # Crocodile Dundee II
            X.loc[X['id'] == 1571,'budget'] = 4000000        # Lady and the Tramp
            X.loc[X['id'] == 1714,'budget'] = 46000000       # The Recruit
            X.loc[X['id'] == 1721,'budget'] = 17500000       # Cocoon
            X.loc[X['id'] == 1865,'revenue'] = 25000000      # Scooby-Doo 2: Monsters Unleashed
            X.loc[X['id'] == 2268,'budget'] = 17500000       # Madea Goes to Jail budget
            X.loc[X['id'] == 2491,'revenue'] = 6800000       # Never Talk to Strangers
            X.loc[X['id'] == 2602,'budget'] = 31000000       # Mr. Holland's Opus
            X.loc[X['id'] == 2612,'budget'] = 15000000       # Field of Dreams
            X.loc[X['id'] == 2696,'budget'] = 10000000       # Nurse 3-D
            X.loc[X['id'] == 2801,'budget'] = 10000000       # Fracture
            X.loc[X['id'] == 3889,'budget'] = 15000000       # Colossal
            X.loc[X['id'] == 6733,'budget'] = 5000000        # The Big Sick
            X.loc[X['id'] == 3197,'budget'] = 8000000        # High-Rise
            X.loc[X['id'] == 6683,'budget'] = 50000000       # The Pink Panther 2
            X.loc[X['id'] == 5704,'budget'] = 4300000        # French Connection II
            X.loc[X['id'] == 6109,'budget'] = 281756         # Dogtooth
            X.loc[X['id'] == 7242,'budget'] = 10000000       # Addams Family Values
            X.loc[X['id'] == 7021,'budget'] = 17540562       #  Two Is a Family
            X.loc[X['id'] == 5591,'budget'] = 4000000        # The Orphanage
            X.loc[X['id'] == 4282,'budget'] = 20000000       # Big Top Pee-wee

            power_six = X.id[X.budget > 1000][X.revenue < 100]

            for k in power_six :
                X.loc[X['id'] == k,'revenue'] =  X.loc[X['id'] == k,'revenue'] * 1000000
                
            return X
        
        except Exception as ex:
            print("Fix revenue transformer error:", ex)
            
class DropFeaturesTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, features):
        self.features = features
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        try:
            return X.drop(self.features, axis=1)
        except Exception as ex:
            print("Drop features transformer error:", ex)
            

class TrainTestTransformer(BaseEstimator, TransformerMixin):        
    def __init__(self, impute=False, normalize=False):
        self.impute = impute
        self.normalize = normalize
        
    def fit(self, X, y=None):
        
        if self.impute:
            X = X.fillna(X.median())
    
        self.X = X.drop('revenue', axis=1)    
        self.y = X['revenue']
        
        if self.normalize:
            self.X = MinMaxScaler().fit_transform(self.X)
        
        return self
    
    def transform(self, X):
        return train_test_split(self.X, self.y, test_size=0.10)

def top_values(X, column, attribute):

    try:
        values = X[column].apply(lambda x: [i[attribute] for i in x] if x != {} else []).values
        top_values = Counter([j for i in values for j in i]).most_common(30)
        top_values = [i[0] for i in top_values]
        return top_values
    except Exception as ex:
        print(ex)

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


text_to_dict = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']

boolean_features = ['homepage', 'belongs_to_collection']

one_hot_objects = ['genres', 'production_countries', 'spoken_languages', 'production_companies']

drop_features = ['id',
                'original_language',
                'Keywords',
                'imdb_id',
                'status',
                'poster_path', 
                'original_title',
                'overview',
                'tagline',
                'title'
                ]


engineered_data = TextToDictTransformer(text_to_dict).transform(train_data.copy())

one_hot_top_values = [top_values(engineered_data, i, 'name')  for i in one_hot_objects]

feature_engineering_pipeline = Pipeline([
    ('boolean_transformer', BooleanTransformer(boolean_features)), 
    ('one_hot_transformer', OneHotTransformer(one_hot_objects, one_hot_top_values)),
    ('date_transformer', DateTransformer()),
    ('cast_transformer', CastTransformer(top_values(engineered_data, 'cast', 'name'),
                                         top_values(engineered_data, 'cast', 'character'))),
    ('crew_transformer', CrewTransformer(top_values(engineered_data, 'crew', 'name'),
                                         top_values(engineered_data, 'crew', 'job'),
                                         top_values(engineered_data, 'crew', 'department'))),
    ('fix_revenue_transformer', FixRevenueTransformer()),
    ('drop_features_transformers', DropFeaturesTransformer(drop_features)),
])

engineered_data = feature_engineering_pipeline.fit_transform(engineered_data)
X_train, X_valid, y_train, y_valid = TrainTestTransformer(impute=True, normalize=False).fit_transform(engineered_data)

In [56]:
from nni.algorithms.feature_engineering.gradient_selector import FeatureGradientSelector
from nni.algorithms.feature_engineering.gbdt_selector import GBDTSelector

In [149]:
fgs = FeatureGradientSelector(n_features=16, classification=False)

fgs.fit(X_train, np.log1p(y_train))
print(fgs.get_selected_features())
selected_features=fgs.get_selected_features()
type(selected_features)

[  0   1   3   4  11  14  21  23  25  55 180 181 213 216 224 257]


numpy.ndarray

In [150]:
X_train_selected=X_train.iloc[:,selected_features]
X_valid_selected=X_valid.iloc[:,selected_features]
X_train_selected.shape

(2700, 16)

In [151]:
print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_valid', X_valid.shape)
print('y_valid', y_valid.shape)

X_train (2700, 258)
y_train (2700,)
X_valid (300, 258)
y_valid (300,)


In [152]:
sample_data = X_valid_selected[:]
sample_labels = np.log1p(y_valid[:])


num_models = 5
forest_reg_models = []

In [153]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

for i in range(num_models):
    forest_reg = RandomForestRegressor(n_estimators=100)
    forest_reg.fit(X_train_selected, np.log1p(y_train))

    preds = forest_reg.predict(sample_data)
    forest_mse = mean_squared_error(sample_labels, preds)
    forest_rmse = np.sqrt(forest_mse)
    
    forest_reg_models.append((forest_reg, forest_rmse))

In [154]:
res = pd.DataFrame({ 'Forest': [i[1] for i in forest_reg_models]})
res
best_forest_model, best_forest_model_rmse = sorted(forest_reg_models, key=lambda x: x[1])[0]
best_forest_model_rmse

1.9619701868165895