In [13]:
%matplotlib inline
print(__doc__)
import glob, re, csv
import numpy as np
from scipy import ndimage
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import RandomizedPCA

Automatically created module for IPython interactive environment


In [14]:
class SolarArray:
    """Solar array detection based on rotated images (quadrupled examples).

    Holds the file locations of the data set and knows how to load the
    training images, the target (to-be-predicted) images and the training
    labels, and how to write predictions to a submission CSV.
    """

    X_path = "./data/rotate_images/*.tif"   # glob for the training images
    target_path = "./data/test_images/*.tif"    # glob for the images to predict
    y_path = "./data/train_rotate_solution.csv"    # CSV holding the training labels
    export_path = "./data/solar_array0.5.csv"    # file written by write_csv()

    # Placeholders; replaced by arrays once read_resource() (or the
    # individual get_* methods) has run.
    X = ""    # training images, one flattened image per row
    X_target = ""    # target images, one flattened image per row
    y = ""    # training labels

    def __init__(self):
        """Create an empty holder.

        Nothing is read from disk here; call read_resource() (or one of
        the get_* methods) to load the data.
        """

    def read_resource(self):
        """Load the training images, the target images and the labels."""
        self.get_X()
        self.get_X_target()
        self.get_y()

    def get_X(self):
        """Load the training images from ``X_path`` into ``self.X``."""
        self.X = self.get_row_pic(self.X_path)

    def get_X_target(self):
        """Load the target images from ``target_path`` into ``self.X_target``."""
        self.X_target = self.get_row_pic(self.target_path)

    def get_y(self):
        """Load the training labels from ``y_path`` into ``self.y``."""
        self.y = self.read_solution(self.y_path)

    def get_row_pic(self, pic_path, gray=True):
        """Read every image matching a glob pattern into a 2-D array.

        Parameters
        ----------
        pic_path : str
            Glob pattern selecting the image files.
        gray : bool, optional
            If True (the default) each image is read with ``mode="L"``
            (grayscale); otherwise ``ndimage.imread``'s default mode is used.

        Returns
        -------
        numpy.ndarray
            One flattened image per row, i.e. shape ``(n_images, -1)``.
            Rows are ordered by the number found in each path.

        Raises
        ------
        ValueError
            If no file matches ``pic_path`` or a matched path contains
            no digits.
        """
        # Numeric ordering (2.tif before 10.tif) instead of the arbitrary
        # glob order -- presumably so rows line up with the label CSV and
        # the submission ids; verify against the data set.
        pic_paths = sorted(glob.glob(pic_path), key=self._path_number)
        if not pic_paths:
            # Fail with a readable message instead of an opaque reshape error.
            raise ValueError("no images match %r" % (pic_path,))

        read_kwargs = {"mode": "L"} if gray else {}
        # NOTE(review): scipy.ndimage.imread is deprecated and absent from
        # newer SciPy releases; this call needs an older SciPy (with PIL).
        pics = np.array([ndimage.imread(path, **read_kwargs)
                         for path in pic_paths])

        # Flatten each image into a single feature row.
        return pics.reshape((len(pic_paths), -1))

    def _path_number(self, path):
        """Return the first run of digits in ``path`` as an int.

        The whole path is searched, so digits in a directory name would
        take precedence over digits in the file name.

        Raises
        ------
        ValueError
            If ``path`` contains no digits.
        """
        match = re.search(r"\d+", path)
        if match is None:
            raise ValueError("no number found in path %r" % (path,))
        return int(match.group())

    def sort_path(self, path1, path2):
        """Compare two paths by the first number each one contains.

        Old-style (``cmp``) comparison function kept for backward
        compatibility; get_row_pic() itself sorts with a key function.

        Returns
        -------
        int
            -1, 0 or 1 when the number in ``path1`` is respectively
            smaller than, equal to or larger than the number in ``path2``.
        """
        num1 = self._path_number(path1)
        num2 = self._path_number(path2)
        # Equivalent of the Python 2 builtin cmp(), which Python 3 lacks.
        return (num1 > num2) - (num1 < num2)

    def read_solution(self, csv_path):
        """Read the ``class`` column of a CSV file.

        Parameters
        ----------
        csv_path : str
            Path of a CSV file whose first row is a header containing a
            ``class`` column.

        Returns
        -------
        numpy.ndarray
            The values of the ``class`` column, as read (strings), in
            file order.

        Raises
        ------
        ValueError
            If the file is empty (no header row).
        KeyError
            If the header has no ``class`` column.
        """
        with open(csv_path) as csvfile:
            reader = csv.reader(csvfile)
            headers = next(reader, None)
            if headers is None:
                raise ValueError(
                    "%r is empty; expected a header row" % (csv_path,))
            if 'class' not in headers:
                raise KeyError('class')
            class_idx = headers.index('class')
            # Rows too short to reach the column (e.g. blank lines) are skipped.
            labels = [row[class_idx] for row in reader
                      if len(row) > class_idx]
        return np.array(labels)

    def write_csv(self, prediction, start_id=201):
        """Write predictions to ``export_path`` as a Kaggle submission CSV.

        The file has an ``id,class`` header followed by one row per
        prediction, with consecutive ids.

        Parameters
        ----------
        prediction : iterable
            Predicted labels, in the same order as the target images.
        start_id : int, optional
            Id given to the first prediction. Defaults to 201 --
            presumably the id of the first test image; confirm against
            the competition data.
        """
        # The context manager closes the file even if a write fails.
        with open(self.export_path, "w") as f:
            writer = csv.writer(f)
            writer.writerow(("id", "class"))
            for offset, label in enumerate(prediction):
                writer.writerow((start_id + offset, label))
# Instantiate the data holder and load the training images, the target
# images and the training labels from the paths configured on the class.
sa = SolarArray()
sa.read_resource()

In [15]:
# One seed for every stochastic step so a re-run reproduces the same
# split, projection, scores and submission file.
seed = 42

# Hold out 25% of the labelled images for evaluation.
# NOTE(review): the training set appears to contain rotated copies of each
# image; a plain random split may place copies of one image on both sides
# and inflate the held-out scores -- confirm and consider a grouped split.
X_train, X_test, y_train, y_test = train_test_split(
    sa.X, sa.y, test_size=0.25, random_state=seed)

# Reduce every flattened image to n_components whitened PCA features.
# The projection is fitted on the training split only, so the held-out
# and target images do not influence it.
n_components = 50
pca = RandomizedPCA(n_components=n_components, whiten=True,
                    random_state=seed).fit(X_train)

X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
X_target_pca = pca.transform(sa.X_target)

# Grid-search the RBF-SVM regularisation (C) and kernel width (gamma)
# with cross-validation on the training split.
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)

# Score on the held-out split, then predict the unlabelled target images.
test_prediction = clf.predict(X_test_pca)
target_prediction = clf.predict(X_target_pca)

print(classification_report(y_test, test_prediction))
print(confusion_matrix(y_test, test_prediction))

print(target_prediction.shape)

# Export the target predictions as the submission file.
sa.write_csv(target_prediction)

             precision    recall  f1-score   support

          0       0.55      0.78      0.65       105
          1       0.56      0.31      0.39        95

avg / total       0.56      0.56      0.53       200

[[82 23]
 [66 29]]
(130,)
