In [1]:
import os
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import to_categorical
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split as SPLIT
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [2]:
seed = 7
np.random.seed(seed)

In [22]:
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(8, input_dim=1, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimator = KerasClassifier(build_fn=baseline_model, epochs=10, batch_size=2, verbose=0)
kfold = KFold(n_splits=3, shuffle=True, random_state=seed)

In [23]:
#https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, batch_size=32, dim=(32,32,32), n_channels=1, n_classes=10, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        #indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        #list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        #X, y = self.__data_generation(list_IDs_temp)
        X, y = self.list_IDs, self.labels

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            #X[i,] = np.load('data/' + ID + '.npy')
            X[i,] = list_IDs

            # Store class
            y[i] = self.labels[ID]

        return X, to_categorical(y, num_classes=self.n_classes)

In [24]:
# Parameters
def get_params(n_classes):
    return {'dim': (32,32,32),
          'batch_size': 64,
          'n_classes': n_classes,  # the num of unique classes for a Y-set
          'n_channels': 1,
          'shuffle': True}

# Datasets
#partition = # IDs
#labels = # Labels

# Generators
def get_generator(df, labels, params):
    #training_generator = DataGenerator(partition['train'], labels, **params)
    return DataGenerator(df, labels, **params)

#validation_generator = DataGenerator(partition['validation'], labels, **params)



In [25]:
def dic(d):
    count = 0
    for k,v in d.items():
        if count > 20:
            return
        count += 1
        print(k, v)

In [26]:
# the folder to hold the datasets (csv) with [text, label]
single_label_dir = os.path.join(os.getcwd(), 'single-label')

categories = os.listdir(single_label_dir)
print(categories)
unique_labels = len(categories)

for categ in categories:
    print(categ)
    categ_path = os.path.join(single_label_dir, categ)
    categ_save_path = os.path.join(single_label_dir, categ) + '.csv'
    print(categ_save_path)
    
    vals = os.listdir(categ_path)
    print(vals)
    df = None
    if not os.path.isfile(categ_save_path):
        categ_frames =  []
        for val in vals:
            dataset = categ_path + '\\' + str(val)
            categ_frames.append(pd.read_csv(dataset))
        df = pd.concat(categ_frames, axis=0, ignore_index=True)
        df.to_csv(categ_save_path)
    else:
        df = pd.read_csv(categ_save_path)
        
    # GENERATOR COMPUTATION OF EACH MAJOR CSV!
    print(df.head())
    print(df.shape)
    
    # create a train/validation set of the text-column
    
    train, validation = SPLIT(df, test_size=0.1)
    print(train.head())
    print(validation.head())
    train_text = validation.text
    #print(train_text[:20])
    validation_text = validation.text
    #print(validation_text[:10])
    
    # labels is a dict of "TEXT"->"LABEL" (key->val)
    KEY = 'text'
    train_labels = train.set_index(KEY)[categ].to_dict()
    dic(train_labels)
    validation_labels = validation.set_index(KEY)[categ].to_dict()
    dic(validation_labels)
    
    # Design model - new for each category
    model = baseline_model()

    # Train model on dataset
    print('{} unique labels'.format(unique_labels))
    params = get_params(unique_labels)
    print(params)
    train_gen = get_generator(train_text, train_labels, params)
    test_gen = get_generator(validation_text, validation_labels, params)
    model.fit_generator(generator=train_gen, validation_data=test_gen)

    del df
    del X_train
    del X_test
    del y_train
    del y_test
    #le = LabelEncoder()
    #le.fit(_Y)
    #le_Y = le.transform(_Y)
    #Y_hot = np_utils.to_categorical(le_Y)
    #results = cross_val_score(estimator, _X, Y_hot, cv=kfold)

    break


['birthyear', 'birthyear.csv', 'birthyearOG.csv', 'fame', 'gender', 'occupation']
birthyear
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\single-label\birthyear.csv
['1940.csv', '1941.csv', '1942.csv', '1943.csv', '1944.csv', '1945.csv', '1946.csv', '1947.csv', '1948.csv', '1949.csv', '1950.csv', '1951.csv', '1952.csv', '1953.csv', '1954.csv', '1955.csv', '1956.csv', '1957.csv', '1958.csv', '1959.csv', '1960.csv', '1961.csv', '1962.csv', '1963.csv', '1964.csv', '1965.csv', '1966.csv', '1967.csv', '1968.csv', '1969.csv', '1970.csv', '1971.csv', '1972.csv', '1973.csv', '1974.csv', '1975.csv', '1976.csv', '1977.csv', '1978.csv', '1979.csv', '1980.csv', '1981.csv', '1982.csv', '1983.csv', '1984.csv', '1985.csv', '1986.csv', '1987.csv', '1988.csv', '1989.csv', '1990.csv', '1991.csv', '1992.csv', '1993.csv', '1994.csv', '1995.csv', '1996.csv', '1997.csv', '1998.csv', '1999.csv', '2000.csv', '2001.csv', '2002.csv', '2003.csv', '2004.csv', '2005.csv', '20

ValueError: No data provided for "dense_20". Need data for each key in: ['dense_20']