In [None]:
#! kaggle competitions download -c tmdb-box-office-prediction

In [None]:
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# import data processing libs
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 500)
# import visualisation libs
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 150
plt.style.use('ggplot')

In [None]:
# Read the competition files and stack train + test into one frame so that the
# same feature engineering is applied to both.
sample_submission = pd.read_csv('sample_submission.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Remember the split point instead of hard-coding the size of the training set.
n_train = train.shape[0]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent (columns present in only one file are filled with NaN).
full = pd.concat([train, test], ignore_index=True, sort=False)

# Re-slice so that train and test expose exactly the same columns.
train = full.iloc[:n_train]
test = full.iloc[n_train:]

assert train.shape[1] == test.shape[1]
assert full.shape == (train.shape[0] + test.shape[0], train.shape[1])

print('train', train.shape)
print('test', test.shape)
print('full', full.shape)

In [None]:
# Split the training columns by dtype: numeric versus object (string) columns.
numeric_cols = train.select_dtypes(include=np.number).columns.tolist()
# `np.object` was removed in NumPy 1.24; the string alias 'object' selects the
# same columns and works on every NumPy version.
string_cols = train.select_dtypes(include='object').columns.tolist()

print(numeric_cols)
print('\n')
print(string_cols)

In [None]:
# Posterpath base url http://image.tmdb.org/t/p/original/

In [None]:
def _get_collection_name(obj):
    import numpy
    return list(map(str.strip, obj.split(",")))[1][9:-12] if isinstance(obj, str) else np.nan

def _jsonify(obj):
    import json
    from json import JSONDecodeError
    import numpy as np
    import re
    
    if isinstance(obj, str):
        obj = re.sub('(?<=[a-z])\'(?=[A-z])', '', obj)
        obj = obj.replace("Donners'", "Donners") \
                      .replace("O'Connor Brothers", "O Connor Brothers") \
                      .replace("d'Azur", "d Azur") \
                      .replace("l'Audiovisuel", "l Audiovisuel") \
                      .replace("Mel's", "Mel s") \
                      .replace("d'Animation", "d Animation") \
                      .replace("Loew's", "Loews") \
                      .replace('"Tor"', 'Tor') \
                      .replace("L'image", "L image") \
                      .replace("d'Aosta", "d Aosta") \
                      .replace("I'm", "Im") \
                      .replace("d'Ici", "d Ici") \
                      .replace('"DIA"', 'DIA') \
                      .replace("Kids'", "Kids") \
                      .replace("l'Eure", "l Eure") \
                      .replace("It's", "Its") \
                      .replace("Isn't", "Isnt") \
                      .replace('"Tsar"', 'Tsar') \
                      .replace("D'Antoni", "D Antoni") \
                      .replace("Gettin'", "Gettin") \
                      .replace("L'image", "L image") \
                      .replace("L'Aide", "L Aide") \
                      .replace("Hell's", "Hells") \
                      .replace("Bull's", "Bulls") \
                      .replace("Cooper's", "Coopers") \
                      .replace("Children's", "Childrens") \
                      .replace("Whitaker's", "Whitakers") \
                      .replace("Jing's", "Jings") \
                      .replace("D'Artagnan", "D Artagnan") \
                      .replace("L'Alma", "L Alma") \
                      .replace("l'Egalité", "l Egalité") \
                      .replace("Performers'", "Performers") \
                      .replace("Butcher's", "Butchers") \
                      .replace("Large's", "Larges") \
                      .replace("We're", "Were") \
                      .replace("d'Investissement", "d Investissement") \
                      .replace("d'Or", "d Or") \
                      .replace("Po'", "Po") \
                      .replace("O' Salvation", "O Salvation") \
                      .replace("Cast N'", "Cast N") \
                      .replace("L'Institut", "L Institut") \
                      .replace("Project '98", "Project 98") \
                      .replace("Ninjas Runnin' ", "Ninjas Runnin ") \
                      .replace('"Kvadrat"', 'Kvadrat') \
                      .replace("l'Europe", "l Europe") \
                      .replace("l'Audiovisuel", "l Audiovisuel") \
                      .replace("Quat'sous", "Quat sous") \
                      .replace("l'Amour", "l Amour") \
                      .replace("l'Image", "l Image") \
                      .replace("Betsy's", "Betsys") \
                      .replace("l'Audiovisuel", "l Audiovisuel") \
                      .replace("Winter's", "Winters") \
                      .replace("That's", "Thats") \
                      .replace("d'Etat", "d Etat") \
                      .replace("Devil's", "Devils") \
                      .replace("Willie's", "Willies") \
                      .replace("Cote D'Ivoire", "Cote D Ivoire") \
                      .replace("mama's", "mamas") \
                      .replace("girls' ", "girls") \
                      .replace("rock 'n' roll", "rock n roll") \
                      .replace("artists' ", "artists") \
                      .replace("workers' ", "workers") \
                      .replace("students' ", "students") \
                      .replace("ladies' ", "ladies") \
                      .replace("years' ", "years") \
                      .replace("boys' ", "boys") \
                      .replace("'", '"') \
                      .replace("\\xa0", "")
            
        try:
            return json.loads(obj)
        except JSONDecodeError:
            return obj
        
    else: return np.nan
    
    
def _unnest_feature(df, col, key):
    """
    Expand a column of stringified lists of dicts into positional columns.

    Every row of df[col] is parsed with _jsonify. For a non-empty list the
    value stored under `key` in the i-th entry goes to column '<col>_<i>'.
    Rows that are missing, unparseable or empty get NaN in '<col>_0'.

    :param df: input frame. type pandas dataframe.
    :param col: name of the column to unnest. type str.
    :param key: dictionary key to extract from every entry. type str.
    :returns: df with the '<col>_<i>' columns appended. type pandas dataframe.
    """
    import pandas as pd
    import numpy as np

    records = []
    for raw in df[col].values:
        parsed = _jsonify(raw)
        if isinstance(parsed, list) and len(parsed) > 0:
            record = {'{}_{}'.format(col, i): entry[key] for i, entry in enumerate(parsed)}
        else:
            # _jsonify returns NaN for missing values and a plain string when
            # decoding fails; the original called np.isnan on that string and
            # raised TypeError. Empty lists used to contribute no row at all.
            record = {col + '_0': np.nan}
        records.append(record)

    # Fix the column order explicitly (position 0, 1, 2, ...) and build the
    # frame once instead of concatenating one single-row frame per record.
    width = max((len(record) for record in records), default=1)
    columns = ['{}_{}'.format(col, i) for i in range(width)]
    # Reuse df's index so the column-wise concat below cannot misalign rows.
    features_df = pd.DataFrame(records, index=df.index, columns=columns)

    assert df.shape[0] == features_df.shape[0]
    return pd.concat([df, features_df], axis='columns')

def _get_cast(obj):
    import numpy as np
    
    if isinstance(obj, str) and obj != "[]":
        return list(filter(lambda x: ('cast_id' in x or 'character' in x or 'name' in x or 'gender' in x or 'order' in x), \
                           obj.split(",")))
    
    else: return np.nan
    
def _count_substring(listlist, substring):
    """
    Count occurance of substring in string.

    :param listlist: list of lists. type list.
    :returns: counts_list. typelist. each inner_list containing a sum of substring.
    """
    import numpy as np
    
    counts_list = []
    for list in listlist:
        counts = [string.count(substring) for string in list]
        counts_list.append(np.sum(counts))
    
    return np.array(counts_list)

def _get_lead_cast_feats(df, casts):
    """
    Append name and gender columns for the first five cast members.

    :param df: frame to extend. type pandas dataframe.
    :param casts: one entry per row of df; a list of cast pieces as produced
    by _get_cast, or anything else (e.g. "") when the cast is missing.
    :returns: df with 'lead1'..'lead5' and 'lead1_gender'..'lead5_gender'
    appended. type pandas dataframe.
    """
    import pandas as pd
    import numpy as np

    n_leads = 5
    name_cols = ['lead{}'.format(i) for i in range(1, n_leads + 1)]
    gender_cols = [name_col + '_gender' for name_col in name_cols]

    def _pad(values):
        # Right-pad with NaN so that every row has exactly n_leads entries.
        return values + [np.nan] * (n_leads - len(values))

    name_rows = []
    gender_rows = []
    for cast in casts:
        pieces = cast if isinstance(cast, (list, tuple)) else []
        # Match on the key prefix. The original substring tests ('name' in x,
        # 'gender' in x) also picked up fragments of character text that
        # merely contain those words.
        name_pieces = [p for p in pieces if p.strip().startswith("'name':")][:n_leads]
        gender_pieces = [p for p in pieces if p.strip().startswith("'gender':")][:n_leads]
        # p[10:-1] drops the 10-character prefix " 'name': '" and the closing quote.
        name_rows.append(_pad([p[10:-1] for p in name_pieces]))
        gender_rows.append(_pad([_map_gender(p) for p in gender_pieces]))

    assert df.shape[0] == len(name_rows) == len(gender_rows)

    # Build both frames once, with all five columns guaranteed and with df's
    # index so that the column-wise concat cannot misalign rows.
    leading_cast_names_df = pd.DataFrame(name_rows, columns=name_cols, index=df.index)
    leading_cast_genders_df = pd.DataFrame(gender_rows, columns=gender_cols, index=df.index)

    return pd.concat([df, leading_cast_names_df, leading_cast_genders_df], axis='columns', sort=False)

def _map_gender(string):
    import numpy as np
    if isinstance(string, str):
        if string.strip() == "'gender': 1":
            return 'Female'
        elif string.strip() == "'gender': 2":
            return 'Male'
        else:
            return 'Unspecified'
    else: return np.nan
    
def plot_information_content(feats):
    """
    Plot the share of non-missing values per column as a horizontal bar chart.

    :param feats: frame to inspect. type pandas dataframe.
    :returns: None. Draws on a new matplotlib figure.
    """
    import matplotlib.pyplot as plt

    information_content = feats.notna().sum() / feats.shape[0]
    fig, ax = plt.subplots(figsize=(15, 40))
    # `kind` must be passed by keyword: newer pandas raises TypeError when
    # Series.plot receives positional arguments.
    information_content.plot(kind='barh', ax=ax)
    ax.set_xlabel('Share of non-missing values')
    ax.set_title('Information content per column')
    plt.tight_layout()

def create_features(df):
    """
    Engineer model features from the raw movie frame.

    Adds collection/sequel flags, unnested genre, company, country, language
    and keyword columns with their counts, text-length features, cast size and
    gender features, release-date parts, and replaces `budget` and `revenue`
    by their log1p transform.

    :param df: raw movie data. type pandas dataframe.
    :returns: copy of df with the engineered columns added. type pandas dataframe.
    """
    import numpy as np
    import pandas as pd

    df = df.copy()

    def _name_count(frame, col):
        # Number of "'name'" occurrences per row of the stringified list column.
        # Counted on the frame being processed: the original read the global
        # `full`, which silently broke the function for any other input.
        pieces = frame[col].fillna("").apply(lambda x: x.split(","))
        return _count_substring(pieces, "'name'")

    df['belongs_to_collection'] = df['belongs_to_collection'].apply(_get_collection_name)
    df['is_sequel'] = np.where(df['belongs_to_collection'].notna(), 1, 0)

    df = _unnest_feature(df, col='genres', key='name')
    df['genres_cnt'] = _name_count(df, 'genres')

    df['overview_len'] = df['overview'].fillna("").map(len)
    df['tagline_len'] = df['tagline'].fillna("").map(len)
    original_titles = df['original_title'].fillna("")
    df['original_title_len'] = original_titles.map(len)
    df['title_contains_The'] = original_titles.str.startswith('The').astype(int)
    df['is_rebranded'] = np.where(df['original_title'] != df['title'], 1, 0)

    df = _unnest_feature(df, col='production_companies', key='name')
    df['production_companies_cnt'] = _name_count(df, 'production_companies')
    df = _unnest_feature(df, col='production_countries', key='name')
    df['production_countries_cnt'] = _name_count(df, 'production_countries')
    df = _unnest_feature(df, col='spoken_languages', key='name')
    df['spoken_languages_cnt'] = _name_count(df, 'spoken_languages')
    df['is_foreign'] = np.where(df['original_language'] != "en", 1, 0)
    df = _unnest_feature(df, col='Keywords', key='name')
    df['keywords_cnt'] = _name_count(df, 'Keywords')

    df['cast'] = df['cast'].apply(_get_cast)
    casts = df['cast'].fillna("")
    df['cast_size'] = _count_substring(casts, "cast_id")
    df['missing_gender_cast'] = _count_substring(casts, "'gender': 0")
    df['female_cast'] = _count_substring(casts, "'gender': 1")
    df['male_cast'] = _count_substring(casts, "'gender': 2")
    assert (df['female_cast'] + df['male_cast'] + df['missing_gender_cast'] == df['cast_size']).all()
    # Rows without any cast have cast_size 0, so both quotas are NaN there.
    df['male_quota'] = df['male_cast'] / df['cast_size']
    df['female_quota'] = 1 - df['male_quota']
    df = _get_lead_cast_feats(df, casts)

    # `infer_datetime_format` is deprecated in pandas 2.x and does not change
    # the parsed values, so it is no longer passed.
    df['release_date'] = pd.to_datetime(df['release_date'])
    # Dates after 2020 are treated as invalid and set to missing. Plain
    # assignment replaces the chained `df.release_date.mask(..., inplace=True)`,
    # which depends on attribute access returning a view of the frame.
    df['release_date'] = df['release_date'].mask(df['release_date'] > '2020')
    df['year'] = df['release_date'].dt.year
    df['month'] = df['release_date'].dt.month
    df['day'] = df['release_date'].dt.day
    # Kept element-wise: `Series.dt.week` no longer exists in pandas 2.x.
    df['week'] = df['release_date'].apply(lambda x: x.week)
    df['summer_movie'] = np.where(df['month'].isin([6, 7, 8]), 1, 0)
    df['winter_movie'] = np.where(df['month'] == 12, 1, 0)

    df['budget'] = np.log1p(df['budget'])
    df['revenue'] = np.log1p(df['revenue'])

    return df

def prepare_df(df, n_train=3000, low_information_threshold=None):
    """
    Turn the engineered feature frame into model-ready train and test files.

    Drops raw text columns, casts categorical features to `category`,
    one-hot encodes them, renames every column to its position (as a string)
    and writes:

    * 'feats_dropped_cols.parquet' - the frame before one-hot encoding
    * 'col_map.json'               - mapping from column name to positional key
    * 'train_df.parquet' / 'test_df.parquet' - the first n_train rows / the rest

    :param df: output of create_features. type pandas dataframe.
    :param n_train: number of leading rows that form the training set. type int.
    :param low_information_threshold: when given, also drop columns whose share
    of non-missing values is below this value (except 'revenue' and
    'belongs_to_collection'). The default None drops nothing extra, which is
    what the original code did after overwriting its computed list with [].
    type float or None.
    :returns: the one-hot encoded frame with positional column names.
    type pandas dataframe.
    """
    import json
    import pandas as pd

    df = df.copy()
    drop_cols = ['homepage', 'imdb_id', 'poster_path', 'crew', 'cast', 'gender', 'overview', 'tagline', 'genres',
                 'production_companies', 'production_countries', 'spoken_languages', 'Keywords', 'title']
    keep_cols = ['revenue', 'belongs_to_collection']

    low_information_content_cols = []
    if low_information_threshold is not None:
        information_content = df.notna().sum() / df.shape[0]
        sparse_cols = information_content.index[information_content < low_information_threshold]
        low_information_content_cols = [col for col in sparse_cols if col not in keep_cols]

    shape_before = df.shape

    # Only drop what is actually present, each column once.
    to_drop = [col for col in dict.fromkeys(drop_cols + low_information_content_cols) if col in df.columns]
    df = df.drop(columns=to_drop)

    print("Dropped {} cols".format(shape_before[1] - df.shape[1]))

    # Type conversions: everything listed here is one-hot encoded below.
    categorical_cols = [
        'belongs_to_collection', 'original_language', 'original_title', 'release_date', 'status', 'is_sequel',
        'genres_0', 'genres_1', 'genres_2',
        'title_contains_The', 'is_rebranded',
        'production_companies_0', 'production_companies_1', 'production_companies_2',
        'production_countries_0',
        'spoken_languages_0', 'is_foreign',
        'Keywords_0', 'Keywords_1', 'Keywords_2', 'Keywords_3', 'Keywords_4', 'Keywords_5', 'Keywords_6',
        'lead1', 'lead2', 'lead3', 'lead4', 'lead5',
        'lead1_gender', 'lead2_gender', 'lead3_gender', 'lead4_gender', 'lead5_gender',
        'year', 'month', 'day', 'week', 'summer_movie', 'winter_movie',
    ]
    for col in categorical_cols:
        # Skip columns that are absent, e.g. removed by the low-information filter.
        if col in df.columns:
            df[col] = df[col].astype('category')

    shape_after = df.shape
    assert shape_before[0] == shape_after[0]

    df.to_parquet('feats_dropped_cols.parquet')

    # Dummies
    df = pd.get_dummies(df, drop_first=True)

    # Replace every column name by its position so the names are safe for the
    # parquet files; the mapping is saved so features can be named again later.
    col_map = {col: str(i) for i, col in enumerate(df.columns)}
    with open('col_map.json', 'w') as f:
        json.dump(col_map, f)
    df = df.rename(columns=col_map)

    train_df = df.iloc[:n_train].copy()
    test_df = df.iloc[n_train:].copy()
    assert (train_df.columns == test_df.columns).all()

    train_df.to_parquet('train_df.parquet', 'pyarrow')
    test_df.to_parquet('test_df.parquet', 'pyarrow')

    return df

1. Crew features
2. Bin continuous features
3. Log transform budget and revenue
4. More count features
5. Exploratory analysis

In [None]:
%time feats = create_features(full)

In [None]:
plot_information_content(feats)

In [None]:
%time feats_dummies = prepare_df(feats)

In [None]:
feats_dummies.shape

### Train with XGBoost

1. Train a model on all features
2. Select important features
3. Retrain model on only important features
4. Find optimal number of estimators by plotting learning curve
5. Retrain model on important features with optimal number of estimators

In [None]:
import json

import pandas as pd

train = pd.read_parquet('train_df.parquet')
test = pd.read_parquet('test_df.parquet')
sample_submission = pd.read_csv('sample_submission.csv')

# prepare_df renamed every column to its position (as a string). Look the keys
# of `id` and `revenue` up in the saved mapping instead of hard-coding '0' and
# '4', which silently breaks as soon as the column order changes.
with open('col_map.json') as f:
    col_map = json.load(f)

excluded = [col_map['id'], col_map['revenue']] # id and revenue
predictors = [col for col in train.columns.tolist() if col not in excluded]
target = train[col_map['revenue']]

In [None]:
import json

import numpy as np
import pandas as pd
import xgboost as xgb

xgbr = xgb.XGBRegressor(objective='reg:squarederror',
                        eval_metric='rmse',
                        n_jobs=-1,
                        random_state=42,
                        n_estimators=200)

xgbr.fit(train[predictors], target)

def rmsle(y, y0):
    """
    Compute root mean squared logarithmic error on the original scale.

    :params y: true label. type list or numpy array.
    :params y0: predicted label. type list or numpy array.

    :returns: rmsle. type float.
    """
    import numpy as np
    return np.sqrt(np.mean(np.square(np.log1p(y) - np.log1p(y0))))

y_train_pred = xgbr.predict(train[predictors])

# The target (and therefore the prediction) is already log1p(revenue), see
# create_features. Map both back to revenue first; feeding the log values
# straight into rmsle applied the logarithm twice.
print('Train set error: ', rmsle(np.expm1(target), np.expm1(y_train_pred)))

with open('col_map.json') as f:
    col_map = json.load(f)

# col_map preserves the column order of the training frame, so names, keys and
# importances line up position by position.
predictor_names = [k for k, v in col_map.items() if v not in excluded]
predictor_keys = [v for k, v in col_map.items() if v not in excluded]
weights = xgbr.feature_importances_

assert len(predictor_names) == len(weights)

# Build the frame column by column; going through a single numpy array turned
# the importances into strings that had to be cast back afterwards.
feature_importances = pd.DataFrame({'key': predictor_keys,
                                    'importance': np.asarray(weights, dtype=np.float64)},
                                   index=predictor_names)

fig, ax = plt.subplots(figsize=(6, 20))
feature_importances.query('importance > 0').sort_values('importance')[['importance']].plot(kind='barh', ax=ax)
ax.set_title('Feature importance')
ax.legend().set_visible(False)

useful_features = feature_importances.query("importance > 0").key
useful_features_keys = useful_features.values.tolist()

In [None]:
len(useful_features_keys)

In [None]:
def rmsle(y, y0):
    """
    Root mean squared logarithmic error.

    Computed as sqrt(mean((log1p(y) - log1p(y0)) ** 2)).

    :params y: true label. type list or numpy array.
    :params y0: predicted label. type list or numpy array.

    :returns: rmsle. type float.
    """
    import numpy as np

    log_residuals = np.log1p(y) - np.log1p(y0)
    mean_squared = np.mean(np.square(log_residuals))
    return np.sqrt(mean_squared)

def rmse(y, y0):
    """
    Compute root mean squared error.

    Both inputs are converted to float arrays first, so plain lists work as
    documented and values are compared by position (no pandas index alignment).

    :params y: true label. type list, numpy array or pandas series.
    :params y0: predicted label. type list, numpy array or pandas series.

    :returns: rmse. type float.
    """
    import numpy as np

    # `y - y0` on two Python lists raises TypeError; convert before subtracting.
    residuals = np.asarray(y, dtype=float) - np.asarray(y0, dtype=float)
    return np.sqrt(np.mean(np.square(residuals)))

def xgb_cv(train,
       predictors,
       target,
       test_size,
       n_trees=10,
       n_splits=10,
       seed=42,
       progressive=False,
       debug=False):

    """
    Perform k-fold cross validation with xgboost.

    The data is first split into a training part and a hold-out part; the
    k folds are taken from the training part only and every fitted model is
    also scored on the hold-out part.

    :param train: training set. type pandas dataframe.
    :param predictors: independent variables. X. type list of col names contained in train.
    :param target: dependent variable. y. type pandas series or dataframe.
    :param test_size: hold-out split size. type float in (0,1).
    :param n_trees: number of estimators in xgboost. type int in (1,inf).
    :param n_splits: number of splits in k-fold. type int (1,inf).
    :param seed: random state for reproducible random numbers. type int.
    :param progressive: if True, cross validate one model for every number of
    estimators from 1 to n_trees and return the fold-averaged score per model
    (a learning curve); if False, cross validate a single model with n_trees
    estimators and return the score per fold. type bool.
    :param debug: verbosity level. type bool.

    :returns: train_scores: training RMSE. type list.
    :returns: valid_scores: validation RMSE. type list.
    :returns: test_scores: hold-out RMSE. type list.
    :returns: xgbr: the last fitted model (None if nothing was fitted).
    """
    import xgboost as xgb
    import numpy as np
    from sklearn.model_selection import train_test_split, KFold

    X_train, X_test, y_train, y_test = train_test_split(train[predictors],
                                                        target,
                                                        test_size=test_size,
                                                        random_state=seed)

    if debug:
        print('X_train shape: {}'.format(X_train.shape))
        print('X_test shape: {}'.format(X_test.shape))
        print('y_train shape: {}'.format(y_train.shape))
        print('y_test shape: {}'.format(y_test.shape))

    # No shuffling was requested, so `random_state` had no effect; recent
    # scikit-learn versions reject it unless shuffle=True. The rows were
    # already shuffled by train_test_split above.
    kf = KFold(n_splits=n_splits)

    def _make_model(n_estimators):
        # One place for the model configuration shared by both modes.
        return xgb.XGBRegressor(objective='reg:squarederror',
                                eval_metric='rmse',
                                n_jobs=-1,
                                n_estimators=n_estimators,
                                random_state=seed)

    def _score_folds(model):
        # Fit `model` on every fold; return per-fold train/valid/hold-out RMSE.
        fold_train, fold_valid, fold_test = [], [], []
        for train_idx, valid_idx in kf.split(X_train):
            X_fit, y_fit = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
            model.fit(X_fit, y_fit)
            fold_train.append(rmse(y_fit, model.predict(X_fit)))
            fold_valid.append(rmse(y_train.iloc[valid_idx],
                                   model.predict(X_train.iloc[valid_idx, :])))
            fold_test.append(rmse(y_test, model.predict(X_test)))
        return fold_train, fold_valid, fold_test

    xgbr = None
    train_scores = []
    valid_scores = []
    test_scores = []

    if progressive:
        for trees in range(1, n_trees+1):
            xgbr = _make_model(trees)
            if debug: print(xgbr.n_estimators)
            fold_train, fold_valid, fold_test = _score_folds(xgbr)
            train_scores.append(np.mean(fold_train))
            valid_scores.append(np.mean(fold_valid))
            test_scores.append(np.mean(fold_test))
    else:
        xgbr = _make_model(n_trees)
        # The original left test_scores empty in this mode, which made the
        # plotting helper report NaN for the hold-out curve.
        train_scores, valid_scores, test_scores = _score_folds(xgbr)

    return train_scores, valid_scores, test_scores, xgbr

def plot_performance(train_scores, valid_scores, test_scores, progressive=False):
    """
    Plot training, validation and hold-out scores as returned by xgb_cv.

    :param train_scores: training scores. type list.
    :param valid_scores: validation scores. type list.
    :param test_scores: hold-out scores; skipped when empty. type list.
    :param progressive: True when the scores are indexed by number of
    estimators, False when they are indexed by cross validation fold. type bool.
    :returns: None. Draws on a new matplotlib figure.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    # Newer matplotlib releases ship this style as 'seaborn-v0_8'; use whichever
    # name is installed and keep the current style if neither is.
    for style_name in ('seaborn', 'seaborn-v0_8'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    if progressive:
        figsize = (0.25*len(train_scores), 5)
        x_label = 'Number of estimators'
    else:
        figsize = (8, 3)
        x_label = 'Cross validation fold'

    fig, ax = plt.subplots(figsize=figsize)
    ax.set_xlabel(x_label)

    curves = (
        ('train', train_scores, '--'),
        ('valid', valid_scores, '-'),
        ('test', test_scores, '-'),
    )
    for name, scores, linestyle in curves:
        if len(scores) == 0:
            # Nothing to draw; np.mean([]) would only yield NaN and a warning.
            continue
        ax.errorbar(x=range(1, len(scores)+1),
                    y=scores,
                    yerr=None,
                    linestyle=linestyle,
                    linewidth=1,
                    label='{} {:.2f} +- {:.2f}'.format(name,
                                                       np.mean(scores),
                                                       np.std(scores)))

    ax.set_title('Learning Curve')
    ax.set_xticks(range(1, len(train_scores)+1))
    ax.legend()
    plt.tight_layout()
In [None]:
train_scores, valid_scores, test_scores, model = \
xgb_cv(train, useful_features_keys, target, test_size=0.3, n_trees=50, progressive=True, n_splits=5, debug=False)

In [None]:
plot_performance(train_scores, valid_scores, test_scores, progressive=True)

In [None]:
# Fit the XGBoost model that the submission cell below predicts with, and report
# RMSE on the fitted rows and on a small hold-out split.
from sklearn.model_selection import train_test_split
# NOTE(review): 0.1% hold-out leaves only a handful of rows for the
# "Test set error" below, so that number is very noisy - confirm this is intended.
test_size=0.001
X_train, X_test, y_train, y_test = train_test_split(train[predictors],
                                                    target,
                                                    test_size=test_size,
                                                    random_state=42)
# NOTE(review): a single tree looks like a leftover from experimentation;
# confirm the intended number of estimators before submitting.
n_trees = 1
xgbr = xgb.XGBRegressor(objective='reg:squarederror',
                eval_metric='rmse',
                n_jobs=-1,
                n_estimators=n_trees,
                random_state=42,
                max_depth=5,
                # NOTE(review): presumably meant to be `verbosity`; `verbose`
                # is not listed among XGBRegressor's parameters - verify.
                verbose=True)

xgbr.fit(X_train, y_train)

y_train_pred = xgbr.predict(X_train)
y_test_pred = xgbr.predict(X_test)

# Both errors are on the scale of `target` as loaded from train_df.parquet.
print('Train set error: ', rmse(y_train, y_train_pred))
print('Test set error: ', rmse(y_test, y_test_pred))

## Submit to Kaggle

In [None]:
# Predict for the test set, undo the log1p transform with expm1 and write the
# submission file.
final_test_pred = xgbr.predict(test[predictors])
sample_submission['revenue'] = np.expm1(final_test_pred)
sample_submission.to_csv('submission25.csv', index=False)

!kaggle competitions submit -c tmdb-box-office-prediction -f submission25.csv -m ""

In [None]:
!kaggle competitions submissions tmdb-box-office-prediction

In [None]:
import lightgbm

In [None]:
import shap
shap.initjs()
explainer = shap.TreeExplainer(xgbr)
shap_values = explainer.shap_values(X_train)

In [None]:
shap.summary_plot(shap_values, X_train)

In [None]:
feats.info()

### Train on LightGBM

In [None]:
import lightgbm as lgb
import pandas as pd
import numpy as np

In [None]:
import json

train = pd.read_parquet('train_df.parquet')
test = pd.read_parquet('test_df.parquet')
sample_submission = pd.read_csv('sample_submission.csv')

# prepare_df renamed every column to its position (as a string). Look the keys
# of `id` and `revenue` up in the saved mapping instead of hard-coding '0' and
# '4', which silently breaks as soon as the column order changes.
with open('col_map.json') as f:
    col_map = json.load(f)

excluded = [col_map['id'], col_map['revenue']]
predictors = [col for col in train.columns if col not in excluded]
target = train[col_map['revenue']]

train_data = lgb.Dataset(data=train[predictors], label=target)
test_data = lgb.Dataset(data=test[predictors])

In [None]:
# LightGBM training parameters: DART boosting on a regression objective,
# evaluated with RMSE.
param = {
    'objective':'regression',
    'boosting':'dart',
    'metric':'rmse',
    'n_jobs':-1,
    'random_state':42,
    'num_trees':200,
    'max_depth':3,
    'verbosity':2
}

# Fit a booster on the full training Dataset; no validation set is passed here.
model = lgb.train(param, train_data)
#lgbm_model = lgb.cv(param, train_data, nfold=5, stratified=False)

In [None]:
print('Train set error: ', rmse(target, model.predict(train[predictors])))
#print('Test set error: ', rmse(y_test, y_test_pred))

In [None]:
# Predict for the test set, undo the log1p transform with expm1 and write the
# submission file.
final_test_pred = model.predict(test[predictors])
sample_submission['revenue'] = np.expm1(final_test_pred)
sample_submission.to_csv('submission25.csv', index=False)

!kaggle competitions submit -c tmdb-box-office-prediction -f submission25.csv -m ""

In [None]:
!kaggle competitions submissions tmdb-box-office-prediction

## Cross validation with Xgboost and GridSearch

In [None]:
import json

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import pandas as pd

train = pd.read_parquet('train_df.parquet')
test = pd.read_parquet('test_df.parquet')
sample_submission = pd.read_csv('sample_submission.csv')

# Resolve the positional keys of `id` and `revenue` from the mapping written by
# prepare_df instead of hard-coding '0' and '4'.
with open('col_map.json') as f:
    col_map = json.load(f)

excluded = [col_map['id'], col_map['revenue']]
predictors = [col for col in train.columns if col not in excluded]
target = train[col_map['revenue']]

# The number of boosting rounds is called `n_estimators` in XGBRegressor.
# The original key 'ntrees' is not a parameter of the estimator, so the search
# refitted the same model once per listed value without changing anything.
params = {
        'n_estimators': [25, 50, 75, 100, 125, 130, 135, 140, 145, 150, 155, 175, 200, 250, 300],
        'max_depth' : [3, 4, 5, 8, 7, 10, 9, 11, 13, 15, 17, 19, 21]
        }


# `verbose=True` was dropped: it is not an XGBRegressor constructor parameter
# (logging is controlled by `verbosity`, left at its default here).
xgbr = xgb.XGBRegressor(objective='reg:squarederror',
                eval_metric='rmse',
                n_jobs=-1,
                random_state=42)

clf = GridSearchCV(xgbr,
                   params,
                   cv=3,
                   n_jobs=1,
                   verbose=3)

clf.fit(train[predictors], target)

In [3]:
clf

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, eval_metric='rmse',
                                    gamma=0, importance_type='gain',
                                    learning_rate=0.1, max_delta_step=0,
                                    max_depth=3, min_child_weight=1,
                                    missing=None, n_estimators=100, n_jobs=-1,
                                    nthread=None, objective='reg:squ...
                                    random_state=42, reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbose=True, verbosity=1),
             iid='warn', n_jobs=1,
             param_grid={'max_depth': [3, 4, 5, 8, 7, 10, 9, 11, 13, 15, 17, 19,
          

In [11]:
clf.verbose

3