In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/catfitmodel/catfit
/kaggle/input/playground-series-s4e12/sample_submission.csv
/kaggle/input/playground-series-s4e12/train.csv
/kaggle/input/playground-series-s4e12/test.csv


In [2]:
#!pip install -U feature-engine
#!pip install optuna-integration[catboost]   

In [3]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import catboost
#from optuna.integration import CatBoostPruningCallback
from optuna.samplers import TPESampler
from catboost.utils import eval_metric
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer

In [4]:
#Optuna objective function
def objective(trial):
    params = {
        #"iterations": 5000, 5000 is taking more than 12 hours    # OPtuna example not specify iteration
        'iterations': trial.suggest_int('iterations', 1000, 4000), 
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 100, 400), 
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1),
        "depth": trial.suggest_int("depth", 1, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        #"subsample": trial.suggest_float("subsample", 0.05, 1.0),  subsample only support bernoulli bootstrap_type
        #"colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 100),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree','Depthwise', 'Lossguide']),
         "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli"] #MVS not compat with GPU
        ),
        "loss_function": "RMSE",
    }
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.05, 1, log=True)

    

    model = CatBoostRegressor(**params, task_type="GPU",silent=True,random_seed=51,)
    
    #pruning_callback = CatBoostPruningCallback(trial, "RMSE") only with CPU

    model.fit(train_pool,verbose=0,
              eval_set=val_pool, 
              #callbacks=[pruning_callback],
             )
    y_pred = model.predict(val_pool)
    #rmse = mean_squared_error(y_val, predictions, squared=False)
    return eval_metric(val_pool.get_label(),y_pred,'RMSE')

In [5]:
df_train = pd.read_csv('/kaggle/input/playground-series-s4e12/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s4e12/test.csv')

In [6]:
#df_train['StartYM']=pd.to_datetime(df_train['Policy Start Date']).dt.strftime('%Y-%m')
df_train['StartY']=pd.to_datetime(df_train['Policy Start Date']).dt.strftime('%Y-%m')
df_test['StartY']=pd.to_datetime(df_test['Policy Start Date']).dt.strftime('%Y-%m')

In [7]:
df_train['StartY'].value_counts()

StartY
2022-05    22166
2023-03    21624
2022-08    21532
2021-07    21404
2022-11    21396
           ...  
2019-12    18581
2024-06    18210
2019-11    17889
2019-08     8868
2024-08     8289
Name: count, Length: 61, dtype: int64

In [8]:
df_train['StartY'].value_counts().reset_index().sort_values(by=['StartY'],ascending=False)

Unnamed: 0,StartY,count
60,2024-08,8289
55,2024-07,18798
57,2024-06,18210
5,2024-05,21251
46,2024-04,19389
...,...,...
56,2019-12,18581
58,2019-11,17889
47,2019-10,19355
42,2019-09,19596


In [9]:
df_train[df_train['Premium Amount']<120].groupby('StartY')['Premium Amount'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
StartY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-08,750.0,55.880000,33.038013,20.0,28.0,40.0,88.00,119.0
2019-09,1728.0,54.236111,32.662809,20.0,28.0,39.0,85.00,119.0
2019-10,1628.0,53.222973,32.727574,20.0,27.0,38.0,81.25,119.0
2019-11,1479.0,54.600406,33.092735,20.0,27.0,39.0,86.50,119.0
2019-12,1646.0,55.174362,32.894424,20.0,28.0,39.0,86.75,119.0
...,...,...,...,...,...,...,...,...
2024-04,1723.0,58.331979,33.660618,20.0,29.0,45.0,94.00,119.0
2024-05,1850.0,58.808108,33.599484,20.0,28.0,45.0,98.00,119.0
2024-06,1564.0,56.691176,33.541466,20.0,28.0,42.0,93.00,119.0
2024-07,1622.0,56.519112,32.745376,20.0,29.0,43.0,87.00,119.0


In [10]:
df_test['Annual Income'].describe()

count    770140.000000
mean      32803.871471
std       32201.063749
min           2.000000
25%        8048.000000
50%       23981.000000
75%       44660.000000
max      149997.000000
Name: Annual Income, dtype: float64

In [11]:
df_train[df_train['Annual Income']<200]['Credit Score'].value_counts()

Credit Score
673.0    26
658.0    26
711.0    25
613.0    25
445.0    24
         ..
428.0     1
350.0     1
335.0     1
322.0     1
405.0     1
Name: count, Length: 539, dtype: int64

In [12]:
df_test[df_test['Annual Income']<200]['Annual Income'].value_counts()

Annual Income
24.0     294
35.0     278
34.0     193
17.0     155
26.0     155
        ... 
73.0       1
150.0      1
87.0       1
198.0      1
146.0      1
Name: count, Length: 184, dtype: int64

In [13]:
#only Age, Anuual Income,Credit Score are number. The rest are category
numeric_columns=df_train.select_dtypes(include=np.number).columns
numeric_columns

Index(['id', 'Age', 'Annual Income', 'Number of Dependents', 'Health Score',
       'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration',
       'Premium Amount'],
      dtype='object')

In [14]:
object_columns=df_train.select_dtypes(include=object).columns
object_columns

Index(['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
       'Policy Type', 'Policy Start Date', 'Customer Feedback',
       'Smoking Status', 'Exercise Frequency', 'Property Type', 'StartY'],
      dtype='object')

In [15]:
#transform target and income
df_train['Label']=np.log1p(df_train['Premium Amount'])
df_train['Label'].describe()

count    1.200000e+06
mean     6.593889e+00
std      1.095825e+00
min      3.044522e+00
25%      6.244167e+00
50%      6.771936e+00
75%      7.319865e+00
max      8.517193e+00
Name: Label, dtype: float64

In [16]:
df_train['Annual Income']=np.log1p(df_train['Annual Income'])

In [17]:
df_test['Annual Income']=np.log1p(df_test['Annual Income'])

In [18]:
#Check Na credit score and premium amount
# There a lot of missing score with a lot of premium no different with having creditscor
df_train[df_train['Credit Score'].notna()].groupby('Premium Amount').size().reset_index(name='count').assign(total=lambda x:x['Premium Amount']*x['count']).groupby('count')['total'].sum()

count
1        720797.0
2       1155398.0
3       1264602.0
4       1489848.0
5       2094605.0
          ...    
3009      84252.0
3122      71806.0
3396      67920.0
3444      82656.0
3735      93375.0
Name: total, Length: 917, dtype: float64

In [19]:
#from feature_engine.transformation import YeoJohnsonTransformer

#tf = YeoJohnsonTransformer(variables = ['Annual Income', 'Label'])

#tf.fit(df_train.dropna())
#tf.lambda_dict_

In [20]:
#df_train_tf=tf.transform(df_train.dropna())

In [21]:
df_train[df_train['Premium Amount']<120].loc[:,['Annual Income','Credit Score']].sort_values(ascending=True,by=['Annual Income']).head()

Unnamed: 0,Annual Income,Credit Score
470076,1.098612,332.0
145069,2.079442,462.0
120276,2.484907,479.0
1187366,2.70805,682.0
1105989,2.833213,466.0


In [22]:
df_train[df_train['Premium Amount']<120]['Credit Score'].describe()

count    91651.000000
mean       614.447764
std        140.831463
min        300.000000
25%        505.000000
50%        623.000000
75%        733.000000
max        849.000000
Name: Credit Score, dtype: float64

In [23]:
df_train['Credit Score'].describe()

count    1.062118e+06
mean     5.929244e+02
std      1.499819e+02
min      3.000000e+02
25%      4.680000e+02
50%      5.950000e+02
75%      7.210000e+02
max      8.490000e+02
Name: Credit Score, dtype: float64

In [24]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 23 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1181295 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1155051 non-null  float64
 4   Marital Status        1181471 non-null  object 
 5   Number of Dependents  1090328 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            841925 non-null   object 
 8   Health Score          1125924 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       835971 non-null   float64
 12  Vehicle Age           1199994 non-null  float64
 13  Credit Score          1062118 non-null  float64
 14  Insurance Duration    1199999 non-

In [25]:
# change fillna from mean to mode to get less effect from outlier
# Ana=df_train['Age'].mode()[0]
# Aina=df_train['Annual Income'].mode()[0]
# Ndna=0
# Hsna=df_train['Health Score'].mode()[0]
# Pcna=0
# Vana=0 #just impute with 0
# Csna=df_train['Credit Score'].mode()[0]
# Idna=0

In [26]:
# df_train['Age'].fillna(Ana,inplace=True)
# df_train['Annual Income'].fillna(Aina,inplace=True)
# df_train['Number of Dependents'].fillna(Ndna,inplace=True)
# df_train['Health Score'].fillna(Hsna,inplace=True)
# df_train['Previous Claims'].fillna(Pcna,inplace=True)
# df_train['Vehicle Age'].fillna(Vana,inplace=True)
# df_train['Credit Score'].fillna(Csna,inplace=True)
# df_train['Insurance Duration'].fillna(Idna,inplace=True)


#Ana=df_test['Age'].mean()
#Aina=df_test['Annual Income'].mean()
#Ndna=0
#Hsna=df_test['Health Score'].mean()
#Pcna=0
#Vana=0 #just impute with 0
#Csna=df_test['Credit Score'].mean()
#Idna=0

# df_test['Age'].fillna(Ana,inplace=True)
# df_test['Annual Income'].fillna(Aina,inplace=True)
# df_test['Number of Dependents'].fillna(Ndna,inplace=True)
# df_test['Health Score'].fillna(Hsna,inplace=True)
# df_test['Previous Claims'].fillna(Pcna,inplace=True)
# df_test['Vehicle Age'].fillna(Vana,inplace=True)
# df_test['Credit Score'].fillna(Csna,inplace=True)
# df_test['Insurance Duration'].fillna(Idna,inplace=True)



In [27]:
# import numpy as np
# from catboost import CatBoostRegressor
# # Initialize data

# train_data = [[1., 4., 5., np.nan],
#               [4., 5., 6., 7.],
#               [30., 40., 50., 60.]]

# eval_data = [[2., 4., 6., 8.],
#              [1., 4., 50., 60.]]

# train_labels = [10., 20., 30.]
# # Initialize CatBoostRegressor
# model = CatBoostRegressor(iterations=2,
#                           learning_rate=1,
#                           depth=2)
# # Fit model
# model.fit(train_data, train_labels)
# # Get predictions
# preds = model.predict(eval_data)

In [28]:
df_train[df_train['Age'].isna()].index

Index([     83,     309,     310,     332,     375,     385,     403,     538,
           553,     570,
       ...
       1199635, 1199653, 1199690, 1199701, 1199704, 1199752, 1199772, 1199784,
       1199893, 1199897],
      dtype='int64', length=18705)

In [29]:
df_train['Occupation'].fillna('Unknown',inplace=True)
df_train['Marital Status'].fillna('Unknown',inplace=True)
df_train['Customer Feedback'].fillna('Unknown',inplace=True)

df_test['Occupation'].fillna('Unknown',inplace=True)
df_test['Marital Status'].fillna('Unknown',inplace=True)
df_test['Customer Feedback'].fillna('Unknown',inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Occupation'].fillna('Unknown',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Marital Status'].fillna('Unknown',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

In [30]:
#cat_features=numeric_columns.drop(['id','Age','Annual Income','Credit Score','Health Score','Premium Amount']).union(object_columns)
cat_features=object_columns
cat_features.to_list()


['Gender',
 'Marital Status',
 'Education Level',
 'Occupation',
 'Location',
 'Policy Type',
 'Policy Start Date',
 'Customer Feedback',
 'Smoking Status',
 'Exercise Frequency',
 'Property Type',
 'StartY']

In [31]:
#change cat columns datatype to astype int
df_train[cat_features.intersection(numeric_columns)]=df_train[cat_features.intersection(numeric_columns)].astype('Int64')

#text_features = cat_features.remove('Vehicle Age')

In [32]:
#change cat columns datatype to int astype
df_test[cat_features.intersection(numeric_columns)]=df_test[cat_features.intersection(numeric_columns)].astype('Int64')

#text_features = cat_features.remove('Vehicle Age')

In [33]:
#custom loss function but will not be able to optimization with it so better transform target instead
#def rmsle_loss(y_true, y_pred):
#    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))
#Also use exp to transform the prediction
#predictions = np.expm1(model.predict(X_test))

In [34]:
#df_train.drop(['id','Annual Income','Customer Feedback','Exercise Frequency','Premium Amount','group Age','Label'],axis=1).columns

In [35]:
#df_train2 = pd.read_csv('/kaggle/input/playground-series-s4e12/train.csv')
#df_train2=df_train2.dropna()


In [36]:
#nonadata = pd.concat([df_train2['Credit Score'],df_train2['Health Score']],axis=1)
#nonadata.head()

In [37]:
#nonadata.describe()
#from sklearn.preprocessing import StandardScaler, RobustScaler

#standard_scaler=StandardScaler()
#scalenonadata=standard_scaler.fit_transform(nonadata)

#robust_scaler=RobustScaler()
#scalenonadata=robust_scaler.fit_transform(nonadata)

In [38]:
#nonadata=pd.DataFrame(scalenonadata,columns=['Credit Score','Health Score'])

In [39]:
#df_train2['Label'] = np.log1p(df_train2['Premium Amount'])

In [40]:
data = df_train.drop(['id','Premium Amount','Policy Start Date','Label'],axis=1) 
cat_features=cat_features.intersection(data.columns)
cat_features=cat_features.difference(['Credit Score','Health Score','Annual Income'])
#text_features=text_features.intersection(data.columns)

In [41]:
cat_features

Index(['Customer Feedback', 'Education Level', 'Exercise Frequency', 'Gender',
       'Location', 'Marital Status', 'Occupation', 'Policy Type',
       'Property Type', 'Smoking Status', 'StartY'],
      dtype='object')

In [42]:
data.columns

Index(['Age', 'Gender', 'Annual Income', 'Marital Status',
       'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',
       'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age',
       'Credit Score', 'Insurance Duration', 'Customer Feedback',
       'Smoking Status', 'Exercise Frequency', 'Property Type', 'StartY'],
      dtype='object')

In [43]:
numeric_columns

Index(['id', 'Age', 'Annual Income', 'Number of Dependents', 'Health Score',
       'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration',
       'Premium Amount'],
      dtype='object')

In [44]:
df_train[cat_features]=df_train[cat_features].astype('category')
df_test[cat_features]=df_test[cat_features].astype('category')

In [45]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 23 columns):
 #   Column                Non-Null Count    Dtype   
---  ------                --------------    -----   
 0   id                    1200000 non-null  int64   
 1   Age                   1181295 non-null  float64 
 2   Gender                1200000 non-null  category
 3   Annual Income         1155051 non-null  float64 
 4   Marital Status        1200000 non-null  category
 5   Number of Dependents  1090328 non-null  float64 
 6   Education Level       1200000 non-null  category
 7   Occupation            1200000 non-null  category
 8   Health Score          1125924 non-null  float64 
 9   Location              1200000 non-null  category
 10  Policy Type           1200000 non-null  category
 11  Previous Claims       835971 non-null   float64 
 12  Vehicle Age           1199994 non-null  float64 
 13  Credit Score          1062118 non-null  float64 
 14  Insurance Duration

In [46]:
df_train['Credit Score'].head()

0    372.0
1    694.0
2      NaN
3    367.0
4    598.0
Name: Credit Score, dtype: float64

In [47]:
pt=PowerTransformer(method='yeo-johnson',copy=False)
pt_columns=df_train.select_dtypes(include=['float64']).drop(['Label'],axis=1).columns
df_train[pt_columns]=pt.fit_transform(df_train[pt_columns])


In [48]:
from catboost import *

X_train,x_val,Y_train,y_val = train_test_split(df_train.drop(['id','Premium Amount','Policy Start Date','Label'],axis=1),df_train['Label'],test_size=0.2,random_state=11)
train_pool=Pool(data=X_train,
                label=Y_train,
                cat_features=cat_features.to_list())
val_pool=Pool(data=x_val,
               label=y_val,
               cat_features=cat_features.to_list())
X_test = df_test.drop(['id','Policy Start Date'],axis=1)


In [49]:
pt_columns=X_test.select_dtypes(include=['float']).columns

In [50]:
X_test[pt_columns]=pt.fit_transform(X_test[pt_columns])

In [51]:
def calc_test_quality(train_pool, val_pool, **kwargs):
    model = CatBoostRegressor(**kwargs, task_type="GPU",random_seed=11)
    model.fit(train_pool, verbose=0, eval_set=val_pool)
    y_pred = model.predict(val_pool)
    return eval_metric(val_pool.get_label(), y_pred, 'RMSE'), model

In [52]:
#Average regression value from 10 random state
def bagging_prob(train_pool, val_pool, **kwargs):
    predictions=[]
    for i in range(1,11):
        model = CatBoostRegressor(**kwargs, task_type="GPU",random_seed=i)
        model.fit(train_pool, verbose=0, eval_set=val_pool)
        predictions.append(model.predict(X_test))
    return np.mean(predictions,axis=0) 

In [53]:
from optuna.samplers import TPESampler

sampler = TPESampler(seed=777)
study = optuna.create_study(
        #pruner=optuna.pruners.MedianPruner(n_startup_trials=5,n_warmup_steps=1000), Not with GPU 
        direction='minimize', 
        sampler=sampler)
study.optimize(objective, n_trials=100)

[I 2024-12-30 15:22:49,162] A new study created in memory with name: no-name-ec5dd8e4-507f-4907-b9cf-2a793222eaba
[I 2024-12-30 15:23:14,282] Trial 0 finished with value: 1.0498105713745032 and parameters: {'iterations': 1458, 'early_stopping_rounds': 191, 'learning_rate': 0.007141605056741639, 'depth': 5, 'l2_leaf_reg': 8.517280459938654, 'min_data_in_leaf': 93, 'grow_policy': 'Depthwise', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.7968588561148293}. Best is trial 0 with value: 1.0498105713745032.
[I 2024-12-30 15:23:31,037] Trial 1 finished with value: 1.048567788314879 and parameters: {'iterations': 2769, 'early_stopping_rounds': 203, 'learning_rate': 0.09889873870870279, 'depth': 7, 'l2_leaf_reg': 7.136013518028578, 'min_data_in_leaf': 56, 'grow_policy': 'Depthwise', 'bootstrap_type': 'Bernoulli', 'subsample': 0.08918259150432425}. Best is trial 1 with value: 1.048567788314879.
[I 2024-12-30 15:24:03,482] Trial 2 finished with value: 1.051483194355997 and parameters: {'

In [54]:
print(study.best_params)

{'iterations': 3890, 'early_stopping_rounds': 386, 'learning_rate': 0.0028562539247123375, 'depth': 10, 'l2_leaf_reg': 1.692766930537637, 'min_data_in_leaf': 39, 'grow_policy': 'Depthwise', 'bootstrap_type': 'Bernoulli', 'subsample': 0.6936710805981859}


In [55]:
# best_params={'iterations': 3993, 
#              'early_stopping_rounds': 196,
#              'learning_rate': 0.04388555358504515,
#              'depth': 10,
#              'l2_leaf_reg': 4.406115225787097,
#              'min_data_in_leaf': 69,
#              'grow_policy': 'Lossguide'}

#best_params={'iterations': 1719,
#             'early_stopping_rounds': 62,
#             'learning_rate': 0.022712635681532305,
#             'depth': 10, 'l2_leaf_reg': 2.266827548044181,
#             'min_data_in_leaf': 66,
#             'grow_policy': 'Depthwise'}
#metriclist, model = calc_test_quality(train_pool,val_pool,**best_params)
#model.save_model('optuna_tune_model')

In [56]:
#print(metriclist)

In [57]:
#predicts = model.predict(X_test)
predicts = bagging_prob(train_pool,val_pool,**study.best_params)

In [58]:
predicts_original_scale = np.expm1(predicts)

In [59]:
temp=pd.concat([df_test['id'],pd.DataFrame(predicts_original_scale,columns=['Premium Amount'])],axis=1).reset_index(drop=True)
temp.to_csv('submissionfloat19.csv',index=False)

In [60]:
#from IPython.display import FileLink

# Provide a link to the file in the notebook
#FileLink("subfloat2.zip")


In [61]:
#round int
#temp=pd.concat([df_test['id'],pd.DataFrame(np.rint(predicts_original_scale),columns=['Premium Amount'])],axis=1).reset_index(drop=True)

In [62]:
#not round int
#temp2=pd.concat([df_test['id'],pd.DataFrame(predicts_original_scale.astype(int),columns=['Premium Amount'])],axis=1).reset_index(drop=True)

In [63]:
#temp.to_csv('submissionX.csv',index=False)

In [64]:
#temp2.to_csv('submission_not_round.csv',index=False)

In [65]:
!rm subnotround.zip
!zip subnotround.zip submission_not_round.csv


rm: cannot remove 'subnotround.zip': No such file or directory

zip error: Nothing to do! (subnotround.zip)


In [66]:
!rm sub.zip
!zip sub.zip submissionX.csv

rm: cannot remove 'sub.zip': No such file or directory

zip error: Nothing to do! (sub.zip)


In [67]:
!ls -al *.zip

ls: cannot access '*.zip': No such file or directory
