In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm

from tqdm import tqdm
from PIL import Image

from sklearn.metrics import accuracy_score

In [3]:
!pip install imagesize

In [4]:
import imagesize

In [5]:
# Register tqdm's pandas integration (the `progress_apply` family of methods).
# NOTE(review): no `progress_apply` call is visible in this notebook, so this
# registration may be unnecessary - confirm before removing.
tqdm.pandas()

In [6]:
# Locations of the competition data; the image folders live under ROOT_DIR.
ROOT_DIR = '../input/happy-whale-and-dolphin'
TRAIN_DIR = f'{ROOT_DIR}/train_images'
TEST_DIR = f'{ROOT_DIR}/test_images'

In [7]:
def get_train_file_path(image_id):
    """Return the path of a training image: TRAIN_DIR and `image_id` joined by '/'."""
    return '{}/{}'.format(TRAIN_DIR, image_id)
def get_test_file_path(image_id):
    """Return the path of a test image: TEST_DIR and `image_id` joined by '/'."""
    return '{}/{}'.format(TEST_DIR, image_id)

In [8]:
# Read the training metadata table that ships with the competition data.
train_csv_path = os.path.join(ROOT_DIR, 'train.csv')
train_df = pd.read_csv(train_csv_path)
#test_df=pd.read_csv(os.path.join(ROOT_DIR,'sample_submission.csv'))

train_df

In [9]:
# Build each image's path with the shared helper rather than repeating the
# string concatenation inline; the previous os.path.join call received a single
# argument and therefore did nothing. The resulting paths are unchanged.
train_df['image_path'] = train_df['image'].map(get_train_file_path)
train_df.head()

In [10]:
# Any species value containing one of these keywords is replaced by a single
# canonical name, applied in the order listed.
for keyword, canonical_name in (('beluga', 'beluga_whale'), ('globis', 'globis_whale')):
    keyword_rows = train_df['species'].str.contains(keyword)
    train_df.loc[keyword_rows, 'species'] = canonical_name

In [11]:
# Repair misspelled species names. regex=False makes both calls plain substring
# substitutions, independent of the pandas version's default for `regex`.
train_df['species'] = (
    train_df['species']
    .str.replace('bottlenose_dolpin', 'bottlenose_dolphin', regex=False)
    .str.replace('kiler_whale', 'killer_whale', regex=False)
)

In [12]:
# Coarse label: 'whale' when the species name contains that word, else 'dolphin'.
train_df['class'] = [
    'whale' if 'whale' in species_name else 'dolphin'
    for species_name in train_df['species']
]

In [13]:
def get_image_size(file_path):
    """Return the dimensions of the image stored at `file_path`.

    The previous version assigned the two scalars to entire columns of the
    global `train_df` (overwriting every row with the size of one image) and
    returned nothing. The size is now handed back to the caller instead.

    Parameters
    ----------
    file_path : path of the image file, passed straight to `imagesize.get`.

    Returns
    -------
    tuple
        `(width, height)` as reported by `imagesize.get`.
    """
    width, height = imagesize.get(file_path)
    return width, height

In [14]:
def create_shape_feature(df):
    """Add image-shape and file-size columns to `df` and return it.

    For every path in `df['image_path']` the image is opened with PIL to read
    its size, and the size of the file on disk is read with `os.path.getsize`.

    Columns written (in this order):
      * `width_height` - the `(width, height)` tuple reported by PIL
      * `file_size`    - result of `os.path.getsize` for the path
      * `width`, `height` - the two components of `width_height`

    Parameters
    ----------
    df : DataFrame with an `image_path` column. It is modified in place.

    Returns
    -------
    The same `df` object, with the columns above added.
    """
    widths = []
    heights = []
    file_sizes = []
    for path_ in tqdm(df['image_path']):
        # The context manager closes the underlying file as soon as the size
        # has been read, instead of leaving one open handle per image.
        with Image.open(path_) as img:
            width, height = img.size
        widths.append(width)
        heights.append(height)
        file_sizes.append(os.path.getsize(path_))

    df['width_height'] = list(zip(widths, heights))
    df['file_size'] = file_sizes
    # Filled directly from the loop results; no second per-row pass is needed.
    df['width'] = widths
    df['height'] = heights
    return df

In [16]:
# Add the width_height / file_size / width / height columns to the training
# table. create_shape_feature opens every image listed in `image_path`, so
# this cell's run time grows with the number of rows.
train_df = create_shape_feature(train_df)
#test_df = create_shape_feature(test_df)

train_df

In [17]:
# Pixel count of each image (width times height).
train_df['area'] = train_df['width'].mul(train_df['height'])

In [18]:
# Bytes on disk per pixel. The column was previously named 'size_per_ pixel'
# (with a stray space); it is not referenced by name anywhere else in the
# notebook, so the spelling is corrected here.
train_df['size_per_pixel'] = train_df['file_size'] / train_df['area']
train_df.head()

In [19]:
from sklearn.preprocessing import LabelEncoder

# Encoder used below to turn species names into integer class labels.
le = LabelEncoder()

In [20]:
# Fit the encoder on the species column and store the encoded labels; this
# column is the classification target used by the folds and the model below.
train_df['species_label'] = le.fit_transform(train_df['species'])

In [21]:
# Report how many distinct encoded labels exist, then show the table.
n_species_labels = train_df['species_label'].nunique()
print(f'species_label amount: {n_species_labels}')
display(train_df)

In [22]:
# 5-fold stratified splitter. No shuffle or random_state is passed, so the
# splitter's defaults apply.
skf = StratifiedKFold(n_splits=5)

In [23]:
# Tag every row with the number of the fold in which it is held out.
# skf.split yields *positional* row numbers, so they are written through .iloc;
# the earlier .loc lookup treated them as index labels, which is only correct
# while train_df keeps its default 0..n-1 index. Pre-filling with -1 also keeps
# the column integer-typed instead of float.
train_df['kfold'] = -1
kfold_col = train_df.columns.get_loc('kfold')
for fold, (_, val_) in enumerate(skf.split(X=train_df, y=train_df['species_label'])):
    train_df.iloc[val_, kfold_col] = fold
train_df.head()

In [24]:
# The coarse whale/dolphin label is removed before modelling.
train_df = train_df.drop(columns=['class'])
train_df.head()

In [25]:
# Keyword arguments forwarded to lgbm.LGBMClassifier in the training loop.
params = dict(
    learning_rate=0.01,
    objective="multiclass",
    boosting_type="gbdt",
    verbosity=-1,
    n_jobs=-1,
    seed=42,
    max_depth=5,
    n_estimators=1000,
)

In [26]:
# Columns that are identifiers, raw paths, bookkeeping or the target itself;
# everything else in train_df is used as a model feature.
NON_FEATURE_COLS = ['image', 'species', 'individual_id', 'image_path',
                    'width_height', 'kfold', 'species_label']

fold_scores = []                                   # validation accuracy of each fold
oof_pred = np.full(len(train_df), -1, dtype=int)   # out-of-fold prediction per row

for fold in range(skf.get_n_splits()):
    valid_mask = (train_df['kfold'] == fold).to_numpy()
    train = train_df[~valid_mask]
    valid = train_df[valid_mask]

    X_train = train.drop(columns=NON_FEATURE_COLS)
    y_train = train['species_label']
    X_valid = valid.drop(columns=NON_FEATURE_COLS)
    y_valid = valid['species_label']

    model = lgbm.LGBMClassifier(**params)
    # Early stopping and log frequency are passed as callbacks: the `verbose`
    # and `early_stopping_rounds` keyword arguments of fit() were deprecated in
    # LightGBM 3.3 and removed in 4.0. Requires LightGBM >= 3.3.
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        callbacks=[
            lgbm.early_stopping(stopping_rounds=15),
            lgbm.log_evaluation(period=1000),
        ],
    )
    pred = model.predict(X_valid)

    # Keep every fold's result; previously each iteration overwrote the last,
    # so only the final fold could ever be evaluated.
    oof_pred[valid_mask] = pred
    fold_scores.append(accuracy_score(y_valid, pred))
    print(f'fold {fold} accuracy: {fold_scores[-1]:.4f}')

print(f'mean fold accuracy: {np.mean(fold_scores):.4f}')
print(f'out-of-fold accuracy: {accuracy_score(train_df["species_label"], oof_pred):.4f}')

In [None]:
#feature_importance
# `model` and `X_train` are the objects left bound by the final iteration of
# the training loop, so this chart describes the last fold only. The title
# previously claimed an average over folds, which the code never computed.
fi = model.feature_importances_

lgb_imp = pd.DataFrame({'Image feature': X_train.columns, 'importance': fi})

fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(
    x="importance",
    y="Image feature",
    data=lgb_imp.sort_values(by="importance", ascending=False),
    ax=ax,
)
ax.set_title('LightGBM Features (last fold)')
fig.tight_layout()

In [None]:
#accuracy
# `y_valid` and `pred` hold whatever the training loop assigned in its final
# iteration, so this is the validation accuracy of the last fold only, not a
# cross-validated score.
acc = accuracy_score(y_valid,pred)
print('accuracy:',acc)