In [1]:
import os

import pandas as pd
import numpy as np
from PIL import Image

from sklearn.linear_model import LogisticRegression

In [2]:
input_path = os.path.join(os.getcwd(), 'input')

train_data = pd.read_csv(os.path.join(input_path, 'train_v2.csv'))

print(train_data.shape)
train_data.head()

(40479, 2)


Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [6]:
image_path = os.path.join(os.getcwd(), 'input', 'train-jpg')
image_arrays = []

for i, filename in enumerate(os.listdir(image_path)):
    with Image.open(os.path.join(image_path, filename)) as temp_file:
        image_arrays.append(np.array(temp_file)[:,:,:3])

In [7]:
r_mean = []
g_mean = []
b_mean = []

for image_array in image_arrays:
    r = image_array[:,:,0].ravel()
    g = image_array[:,:,1].ravel()
    b = image_array[:,:,2].ravel()
    
    r_mean.append(np.mean(r))
    g_mean.append(np.mean(g))
    b_mean.append(np.mean(b))

In [8]:
# train_data = train_data[:100].copy()

train_data['r_mean'] = pd.Series(r_mean)
train_data['g_mean'] = pd.Series(g_mean)
train_data['b_mean'] = pd.Series(b_mean)

print(train_data.shape)
train_data.head()

(40479, 5)


Unnamed: 0,image_name,tags,r_mean,g_mean,b_mean
0,train_0,haze primary,163.212616,145.653,151.390854
1,train_1,agriculture clear primary water,189.360138,178.502625,190.69841
2,train_2,clear primary,199.677139,197.728287,204.278961
3,train_3,clear primary,157.16095,148.410355,146.096863
4,train_4,agriculture clear habitation primary road,131.169815,120.781433,128.889297


In [9]:
is_primary = [1 if 'primary' in tag else 0 for tag in train_data.tags]

train_data['is_primary'] = pd.Series(is_primary)
train_data.tail()

Unnamed: 0,image_name,tags,r_mean,g_mean,b_mean,is_primary
40474,train_40474,clear primary,192.044098,182.328033,191.455551,1
40475,train_40475,cloudy,155.071167,136.610947,144.972229,0
40476,train_40476,agriculture clear primary,198.980667,184.776413,190.802536,1
40477,train_40477,agriculture clear primary road,206.233414,200.593918,208.428513,1
40478,train_40478,agriculture cultivation partly_cloudy primary,196.645172,194.77475,204.354675,1


### Model

In [10]:
X = train_data[['r_mean', 'g_mean', 'b_mean']]
X.head()

Unnamed: 0,r_mean,g_mean,b_mean
0,163.212616,145.653,151.390854
1,189.360138,178.502625,190.69841
2,199.677139,197.728287,204.278961
3,157.16095,148.410355,146.096863
4,131.169815,120.781433,128.889297


In [11]:
y = train_data['is_primary']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: is_primary, dtype: int64

In [14]:
split = int(len(X)*.8)
X = np.array(X)
X_train = X[:split]
X_test = X[split:]

In [15]:
y = np.array(y)
y_train = y[:split]
y_test = y[split:]

In [16]:
lr = LogisticRegression(solver='lbfgs')

In [17]:
lr = lr.fit(X_train, y_train)

In [18]:
lr.score(X_test, y_test)

0.9307065217391305

In [19]:
for i, X_i, in enumerate(X_test[:100]):
    print(lr.predict(X_i.reshape(1, -1)), y_test[i])

[1] 1
[1] 1
[1] 0
[1] 0
[1] 0
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 0
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 0
[1] 1
[1] 0
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 0
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 0
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1
[1] 1


In [20]:
np.sum(y)

37513

In [21]:
sum(lr.predict(X))

40479

#### Model is predicting all '1'