# NEXT STEPS

#### Weather data should have one column with tag name. Use multi-class classifier instead of 4 binary classifiers. Need to do post-processing to merge the results of our models. Put pre-processing steps that get repeated into functions in a separate script.

#### Once data flow is complete, work on improving the model. Include color normalization, play with different algorithms, open cv package, etc.

# PRE-PROCESSING

In [1]:
import os

import pandas as pd
import numpy as np
from PIL import Image

from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [2]:
input_path = os.path.join(os.getcwd(), 'input')

train_data = pd.read_csv(os.path.join(input_path, 'train_v2.csv'))
print("dataframe shape: ", train_data.shape)

tags = list(set([x for tag in train_data.tags for x in tag.split(" ")]))
print("number of tags: ", len(tags))

dataframe shape:  (40479, 2)
number of tags:  17


In [3]:
'''
load random sample of images
'''
# image_path = os.path.join(os.getcwd(), 'input', 'train-jpg')
# r = np.random.permutation(len(os.listdir(image_path)))
# random_sample = r[:500]

# train_files = [os.listdir(image_path)[r][:-4] for r in random_sample]
# train_data = train_data[train_data['image_name'].isin(train_files)]

'''
load all images (n = 1505)
'''
image_path = os.path.join(os.getcwd(), 'input', 'train-jpg')
train_files = [x[:-4] for x in os.listdir(image_path)]
train_data = train_data[train_data['image_name'].isin(train_files)]
print("dataframe shape: ", train_data.shape)

dataframe shape:  (1505, 2)


In [5]:
weather_tags = ['clear', 'haze', 'cloudy', 'partly_cloudy']
landscape_tags = [tag for tag in tags if tag not in weather_tags]

weather_train_data = train_data.copy()
landscape_train_data = train_data.copy()

for tag in tags:
    train_data[tag] = [1 if tag in x else 0 for x in train_data['tags']]

for tag in weather_tags:
    weather_train_data[tag] = [1 if tag in x else 0 for x in weather_train_data['tags']]

# cloudy gets captured when the tag is partly_cloudy
for i in train_data.index:
    if train_data.loc[i, 'partly_cloudy'] == 1:
        train_data.loc[i, 'cloudy'] = 0
for i in weather_train_data.index:
    if weather_train_data.loc[i, 'partly_cloudy'] == 1:
        weather_train_data.loc[i, 'cloudy'] = 0

for tag in landscape_tags:
    landscape_train_data[tag] = [1 if tag in x else 0 for x in landscape_train_data['tags']]
    
print("all dataframe shape: ", train_data.shape)
print("weather dataframe shape: ", weather_train_data.shape)
print("landscape_dataframe shape: ", landscape_train_data.shape)

all dataframe shape:  (1505, 19)
weather dataframe shape:  (1505, 6)
landscape_dataframe shape:  (1505, 15)


In [6]:
weather_train_data.head()

Unnamed: 0,image_name,tags,clear,haze,cloudy,partly_cloudy
0,train_0,haze primary,0,1,0,0
1,train_1,agriculture clear primary water,1,0,0,0
10,train_10,agriculture clear primary slash_burn water,1,0,0,0
11,train_11,clear primary water,1,0,0,0
100,train_100,bare_ground clear water,1,0,0,0


In [7]:
landscape_train_data.head()

Unnamed: 0,image_name,tags,bare_ground,road,conventional_mine,water,selective_logging,primary,slash_burn,cultivation,artisinal_mine,habitation,agriculture,blow_down,blooming
0,train_0,haze primary,0,0,0,0,0,1,0,0,0,0,0,0,0
1,train_1,agriculture clear primary water,0,0,0,1,0,1,0,0,0,0,1,0,0
10,train_10,agriculture clear primary slash_burn water,0,0,0,1,0,1,1,0,0,0,1,0,0
11,train_11,clear primary water,0,0,0,1,0,1,0,0,0,0,0,0,0
100,train_100,bare_ground clear water,1,0,0,1,0,0,0,0,0,0,0,0,0


In [8]:
'''
Drop images that don't have exactly 1 weather tag.
'''
weather_tag_count = pd.DataFrame(weather_train_data.sum(axis=1))
# weather_train_data.loc[weather_tag_count[weather_tag_count[0]!=1].index]
weather_train_data = weather_train_data.drop(weather_tag_count[weather_tag_count[0]!=1].index, axis=0)
landscape_train_data = landscape_train_data.drop(weather_tag_count[weather_tag_count[0]!=1].index, axis=0)

In [9]:
'''
Cloudy images do not have landscape data.
Remove from landscape_training_data only.
'''
landscape_tag_count = pd.DataFrame(landscape_train_data.sum(axis=1))
# landscape_tag_count[landscape_tag_count[0]<1]
# no_landscape = landscape_train_data.loc[landscape_tag_count[landscape_tag_count[0]<1].index]
# no_landscape[no_landscape['tags']!='cloudy'] # 1 partly_cloudy
# train_data[train_data['cloudy']==1]['tags'].min() == train_data[train_data['cloudy']==1]['tags'].max()
landscape_train_data = landscape_train_data.drop(landscape_tag_count[landscape_tag_count[0]<1].index, axis=0)

In [10]:
image_path = os.path.join(os.getcwd(), 'input', 'train-jpg')
train_files = [x[:-4] for x in os.listdir(image_path)]
train_data = train_data[train_data['image_name'].isin(train_files)]
print("dataframe shape: ", train_data.shape)

dataframe shape:  (1505, 19)


In [11]:
weather_images = []
file_names = [str(x)+'.jpg' for x in weather_train_data['image_name']]

for i, filename in enumerate(file_names):
    with Image.open(os.path.join(image_path, filename)) as temp_file:
        weather_images.append(np.array(temp_file)[:,:,:3])
print(weather_images[0].shape)
print(len(weather_images) == len(weather_train_data))

landscape_images = []
file_names = [str(x)+'.jpg' for x in landscape_train_data['image_name']]
for i, filename in enumerate(file_names):
    with Image.open(os.path.join(image_path, filename)) as temp_file:
        landscape_images.append(np.array(temp_file)[:,:,:3])
        
print(landscape_images[0].shape)
print(len(landscape_images) == len(landscape_train_data))

(256, 256, 3)
True
(256, 256, 3)
True


In [12]:
'''
Compacting logic:
Want to reduce the image arrays by taking the max value
in each 2x2 pixel square.

256x256 -> 128x128 -> 64x64

'''

def compact(array):
    row = 0
    col = 0
    maxes = []
    while row < array.shape[0]:
        col = 0
        while col < array.shape[1]:
#             print("row =", row, "col = ", col)
            maxes.append(max(array[row, col], array[row, col+1], array[row+1, col], array[row+1,col+1]))
#             print(z)
            col += 2
        row += 2
#     maxes = np.array(maxes).reshape([size,size])
    return maxes

In [13]:
# y = np.array([[1,2,3,4,21,22,23,24],[1,2,3,4,31,32,33,34],[1,2,3,4,21,22,23,24],[1,2,3,4,31,32,33,34],[1,2,3,4,21,22,23,24],[1,2,3,4,31,32,33,34],[1,2,3,4,21,22,23,24],[1,2,3,4,31,32,33,34]])
# x = compact(y)
# compact(np.array(x).reshape(4,4))

In [14]:
landscape_red = []
landscape_green = []
landscape_blue = []

for image in landscape_images:
    image_array = np.array(image)[:,:,:3] #256x256x3
    r = image_array[:,:,0] #256x256
    g = image_array[:,:,1] 
    b = image_array[:,:,2] 
    
    ## Compact function reduces arrays to 128x128. Run twice to get 64x64 arrays.
    landscape_red.append(compact(np.array(compact(r)).reshape(128,128))) #list of images (one row per image), 4096 (64x64) values each
    landscape_green.append(compact(np.array(compact(g)).reshape(128,128)))
    landscape_blue.append(compact(np.array(compact(b)).reshape(128,128)))
    
weather_red = []
weather_green = []
weather_blue = []

for image in weather_images:
    image_array = np.array(image)[:,:,:3] #256x256x3
    r = image_array[:,:,0] #256x256
    g = image_array[:,:,1] 
    b = image_array[:,:,2] 
    
    ## Compact function reduces arrays to 128x128. Run twice to get 64x64 arrays.
    weather_red.append(compact(np.array(compact(r)).reshape(128,128))) #list of images (one row per image), 4096 (64x64) values each
    weather_green.append(compact(np.array(compact(g)).reshape(128,128)))
    weather_blue.append(compact(np.array(compact(b)).reshape(128,128)))

In [23]:
'''
Build array from color lists created in last cell.
Can choose to use 1 color or all 3.
'''
# def colors(single_color=False, color1=None):
#     if single_color:
#         return np.array(color1)
#     else:
#         colors = np.concatenate((np.array(red), np.array(blue), np.array(green)), axis=1)
#         return colors

In [15]:
'''
Returns array of 1 row per image, columns represent color values for each pixels
'''
# data = colors(single_color=True, color1=green)
# data.shape

# data = colors()
# data.shape

weather_data = np.concatenate((np.array(weather_red), np.array(weather_blue), np.array(weather_green)), axis=1)
landscape_data = np.concatenate((np.array(landscape_red), np.array(landscape_blue), np.array(landscape_green)), axis=1)

In [10]:
'''
Further dimensionality reduction with PCA
'''
# pca = PCA(n_components=100)
# data_pca = pca.fit_transform(data)
# print(data_pca.shape)

'\nFurther dimensionality reduction with PCA\n'

# LANDSCAPE MODEL

In [25]:
def fit_model(image_data, labels, tag):
    X_train, X_test, y_train, y_test = train_test_split(image_data, np.array(labels[tag]), test_size=0.2, random_state=2)
    lr = LogisticRegression(solver='lbfgs')
    lr.fit(X_train, y_train)
    labels = lr.predict(X_test)
    print (str(tag) + ' f1 score = ' + str(f1_score(y_test, labels)))
    return y_test, labels

In [29]:
results = pd.DataFrame()

for tag in landscape_tags:
    actual, predicted = fit_model(landscape_data, landscape_train_data, tag)
    results['actual_' + str(tag)] = actual
    results['predicted_' + str(tag)] = predicted

results.head()

bare_ground f1 score = 0.0


  'recall', 'true', average, warn_for)


slash_burn f1 score = 0.0
blow_down f1 score = 0.0
conventional_mine f1 score = 0.0
selective_logging f1 score = 0.0
primary f1 score = 0.9781818181818183
cultivation f1 score = 0.1818181818181818
water f1 score = 0.22448979591836735
road f1 score = 0.39669421487603307


  'precision', 'predicted', average, warn_for)


artisinal_mine f1 score = 0.0
blooming f1 score = 0.0
agriculture f1 score = 0.5562130177514791
habitation f1 score = 0.24489795918367346


Unnamed: 0,actual_bare_ground,predicted_bare_ground,actual_slash_burn,predicted_slash_burn,actual_blow_down,predicted_blow_down,actual_conventional_mine,predicted_conventional_mine,actual_selective_logging,predicted_selective_logging,...,actual_road,predicted_road,actual_artisinal_mine,predicted_artisinal_mine,actual_blooming,predicted_blooming,actual_agriculture,predicted_agriculture,actual_habitation,predicted_habitation
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
results.sum()

actual_bare_ground               5
predicted_bare_ground           11
actual_slash_burn                0
predicted_slash_burn             4
actual_blow_down                 1
predicted_blow_down              4
actual_conventional_mine         0
predicted_conventional_mine      1
actual_selective_logging         3
predicted_selective_logging      8
actual_primary                 272
predicted_primary              278
actual_cultivation              25
predicted_cultivation           30
actual_water                    64
predicted_water                 34
actual_road                     73
predicted_road                  48
actual_artisinal_mine            4
predicted_artisinal_mine         0
actual_blooming                  2
predicted_blooming               5
actual_agriculture              89
predicted_agriculture           80
actual_habitation               26
predicted_habitation            23
dtype: int64

# WEATHER MODEL

In [31]:
results = pd.DataFrame()

for tag in weather_tags:
    actual, predicted = fit_model(weather_data, weather_train_data, tag)
    results['actual_' + str(tag)] = actual
    results['predicted_' + str(tag)] = predicted

results.head()

clear f1 score = 0.8205128205128205
haze f1 score = 0.0
cloudy f1 score = 0.22727272727272727
partly_cloudy f1 score = 0.29850746268656714


Unnamed: 0,actual_clear,predicted_clear,actual_haze,predicted_haze,actual_cloudy,predicted_cloudy,actual_partly_cloudy,predicted_partly_cloudy
0,1,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0,1
2,1,1,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0


In [32]:
results.sum()

actual_clear               225
predicted_clear            243
actual_haze                 21
predicted_haze              11
actual_cloudy               16
predicted_cloudy            28
actual_partly_cloudy        39
predicted_partly_cloudy     28
dtype: int64

In [14]:
# nb = GaussianNB()
# nb.fit(X_train, y_train)
# labels = nb.predict(X_test)
# print ('Accuracy = ' + str(np.sum(labels == y_test)*1.0/len(y_test)))
# print('Prediction: ' + str(labels))
# print('    Actual: ' + str(y_test))

In [15]:
# print(classification_report(y_test, labels, target_names=['primary', 'other']))