Code: Python 3 implementation of the mistake-bound 'Perceptron' algorithm for multi-class
classification. It uses Kesler's construction as outlined in
http://l2r.cs.illinois.edu/~danr/Teaching/CS446-15/Lectures/07-LecMulticlass.pdf.
The update rule is 'conservative' (on a mistake, only the weights of the correct
label and of the wrongly predicted label are changed); the definition is provided on
slide 32 of this pdf.

Dataset: http://www.start.umd.edu/gtd/
"The Global Terrorism Database (GTD) is an open-source database including
information on terrorist events around the world from 1970 through 2014 (with
annual updates planned for the future)."

Problem: Using 'features' associated with an incident like location, weapon-type etc.
to predict the group (outfit) associated or responsible for the incident. The
current implementation uses 25 features out of many provided at the source link.

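Input: The program expects the file 'gtd.data' (csv) in the run folder.
Output: 'info.log' (training log) and 'eval.txt' (predicted group for each incident whose group is 'Unknown') are written to the run folder.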

In [None]:
import logging
import os
from enum import Enum

import numpy as np
import pandas as pd

try:
    # train_test_split lives here since scikit-learn 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # legacy location, removed in scikit-learn 0.20
    from sklearn.cross_validation import train_test_split

class classifier:
  """
  Common base class for the classification algorithms in this file.

  It carries no state. The three stages below are placeholders that a
  concrete algorithm is expected to override; calling them on the base class
  raises NotImplementedError.
  """

  def __init__(self):
    """Nothing to initialise at the base level."""

  def train(self):
    """
    pure virtual method: fit the model
    """
    raise NotImplementedError(':train')

  def test(self):
    """
    pure virtual method: measure the model on held-out data
    """
    raise NotImplementedError(':test')

  def eval(self):
    """
    pure virtual method: predict for unlabeled data
    """
    raise NotImplementedError(':eval')

class dataset():
  """
  class to store information related to input/output data

  The constructor only checks that the input file exists and creates empty
  containers; reading the file and filling the containers is left to the
  subclass.
  """
  def __init__(self, filename):
    """
    filename -- path of the input file

    Raises LookupError if the path does not exist.
    """
    if not os.path.exists(filename):
      # interpolate the name into the message (previously the format string
      # and the filename were passed as two separate exception arguments)
      raise LookupError('Input file not found %s' % filename)

    self.labelMap = {} # stores maps {whole_numbers -> label_names}
    self.reverseLabelMap = {} # stores maps {label_names -> whole_numbers}
    self.features = []
    self.labels = []

  def splitData(self, testSplit : float):
    """
    performs a random split of self.features / self.labels into a training
    and a test part using scikit-learn's train_test_split

    testSplit -- forwarded as test_size (e.g. 0.2 keeps 20% for testing)

    The result is stored in features_train, features_test, labels_train and
    labels_test.
    """
    self.features_train, self.features_test, self.labels_train, self.labels_test = \
      train_test_split(self.features, self.labels, test_size=testSplit)

In [None]:
class gtd(dataset):
  """
  class for the gtd dataset

  Reads the csv file, keeps the manually selected columns, drops incomplete
  rows and splits the remainder into
    * labeled incidents   -> features / labels (group names encoded as whole numbers)
    * 'Unknown' incidents -> features_eval / eventids_eval (to be predicted)
  A constant column of 1's is appended to both feature matrices.
  """
  def __init__(self, filename):
    """
    filename -- path of the csv file; LookupError is raised by the base class
                if it does not exist
    """

    # typedefs ('float' instead of 'int', since pandas has trouble when
    # dtype=int has Nan in the input file)
    categorical = np.float32
    boolean = np.float32

    # dictionary of important features (manually selected) to start analysis
    # we could use PCA/RandomForest/other-techniques to further reduce dimensionality
    # 'eventid' (identifier) and 'gname' (label) are read as well but are not
    # part of the feature vector
    self.relevantFeatures = {
        'eventid':int,
        'extended':boolean,
        'country':categorical,
        'region':categorical,
        'latitude':np.float32,
        'longitude':np.float32,
        'attacktype1':categorical,
        'success':boolean,
        'suicide':boolean,
        'crit1':boolean,
        'crit2':boolean,
        'crit3':boolean,
        'multiple':boolean,
        'targtype1':categorical,
        'natlty1':categorical,
        'guncertain1':boolean,
        'claimed':boolean,
        'weaptype1':categorical,
        'nkill':np.float32,
        'nwound':np.float32,
        'property':boolean,
        'ishostkid':boolean,
        'INT_LOG':boolean,
        'INT_IDEO':boolean,
        'INT_MISC':boolean,
        'INT_ANY':boolean,
        'gname':str
    }

    super().__init__(filename)
    self.readFile(filename)

  @staticmethod
  def _withBias(frame):
    """
    returns the frame as an array with a column of 1's appended, since we
    absorb theta into the w-matrix
    """
    values = np.array(frame)
    return np.append(values, np.ones([values.shape[0], 1]), axis=1)

  def readFile(self, filename):
    """
    reads the csv file and fills features, labels, labelMap, reverseLabelMap,
    num_labels, features_dim, eventids_eval and features_eval
    """
    columns = list(self.relevantFeatures)

    # read data into a data-frame; only the relevant columns are parsed
    data = pd.read_csv(filename, header=0, skiprows=0,
        sep=r",", na_values=" NaN", usecols=columns,
        dtype=self.relevantFeatures, low_memory=False)

    # fix the column order to the order of 'relevantFeatures'
    # ad-hoc : dropping the rows with a NaN. We could do something intelligent
    # here like deducing the hidden/missing variables
    data = data[columns].dropna()

    # separate the incidents to be predicted from the labeled ones
    unknown = data['gname'] == 'Unknown'
    evalFrame = data[unknown]
    labeled = data[~unknown]

    # fill the labels (group-name) and features; drop() returns a new frame,
    # so the filtered slices above are never modified in place
    names = list(labeled['gname'])
    self.features = self._withBias(labeled.drop(['gname', 'eventid'], axis=1))

    # fill maps; the names are sorted so that the name <-> number encoding is
    # the same on every run (iteration order of a set of strings is not)
    for index, name in enumerate(sorted(set(names))):
      self.labelMap[index] = name
      self.reverseLabelMap[name] = index

    # change the labels on input data to whole numbers
    self.labels = [self.reverseLabelMap[name] for name in names]

    self.num_labels = len(self.labelMap) # number of 'unique' labels
    self.features_dim = self.features.shape[1] # dimensionality of data (incl. the column of 1's)

    # fill the features for evaluation using the 'evalFrame' dataframe that was
    # created earlier
    self.eventids_eval = list(evalFrame['eventid'])  # event-ids for the incidents with unknown group
    self.features_eval = self._withBias(evalFrame.drop(['gname', 'eventid'], axis=1))

In [None]:
# module logger writing to 'info.log'; mode='w' starts a fresh log on every run
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.FileHandler('info.log', mode='w')
handler.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)  # DEBUG so that the per-iteration messages are kept
logger.addHandler(handler)

class perceptron(classifier):
  """
  multi-class perceptron

  Keeps one weight row per label. A sample is assigned the label whose row
  has the largest dot product with the feature vector. Training uses the
  mistake-driven update of Kesler's construction: on a misprediction only the
  rows of the correct and of the predicted label are changed.
  """
  def __init__(self, d : 'dataset', **kwds):
    """
    d    -- dataset providing num_labels, features_dim and the train/test/eval data
    kwds -- must contain 'learningRate' and 'maxIterations' (KeyError otherwise);
            other keys are ignored
    """
    super().__init__()
    assert isinstance(d, dataset)

    # the weight matrix is rxd, r=#labels, d=dimension of the feature vectors
    # as reported by the dataset
    self.weights = np.zeros([d.num_labels, d.features_dim])
    self.learningRate = kwds['learningRate']
    self.maxIterations = kwds['maxIterations']
    self.dataset = d

  def eval(self):
    """
    writes one line per evaluation sample (event-id and predicted group name)
    to 'eval.txt'
    """
    logger.info('Begin evaluating Perceptron')

    # 'with' closes the file even if a prediction or a write fails
    with open('eval.txt', 'w') as fd:
      for _id, feature in zip(self.dataset.eventids_eval, \
          self.dataset.features_eval):
        prediction = self.dataset.labelMap[self.getLabel(feature)]
        fd.write("Eventid:" + str(_id) +  "\tPredicted group:" + prediction+"\n")

  def test(self):
    """
    prints the fraction of test samples that are predicted correctly
    """
    logger.info('Begin testing Perceptron')

    total = len(self.dataset.labels_test)
    if not total:
      # nothing to measure; avoid dividing by zero
      logger.warning('No test samples available. Skipping test accuracy')
      return

    mistakes = sum(1 for feature, label in
        zip(self.dataset.features_test, self.dataset.labels_test)
        if self.getLabel(feature) != label)

    print('Test accuracy={:f}'.format(1-mistakes/total))

  def getLabel(self, feature) -> int:
    """
    returns the index of the weight row with the largest score for 'feature'
    """
    return np.argmax(np.dot(self.weights, feature))

  def train(self):
    """
    runs passes over the training data until a pass makes no mistake or
    maxIterations passes have been completed

    When the pass limit is reached, the printed training accuracy is based on
    the mistakes counted during the last pass (each sample is scored with the
    weights in effect when it is visited).
    """
    logger.info('Begin training Perceptron')
    num_samples = len(self.dataset.labels_train)
    logger.info("#Training Samples=%i", num_samples)

    iterations = 0
    mistakes = 0

    # exactly maxIterations passes are allowed; a limit <= 0 means no pass
    while iterations < self.maxIterations:
      iterations += 1

      mistakes = 0  # reset
      for feature, label in zip(self.dataset.features_train, self.dataset.labels_train):
        prediction = self.getLabel(feature)

        # mistake-bound perceptron algorithm using Keslers's construction. In
        # case of a misprediction, we increase weights corresponding to the
        # correct label, and decrease weights corresponding to the predicted
        # label. Only these two rows change, so they are updated in place.
        if prediction != label:
          step = self.learningRate*feature
          self.weights[label] += step
          self.weights[prediction] -= step
          mistakes += 1

      if not mistakes:
        logger.info('Training complete. No mistakes on training data')
        break

      logger.debug("Completed iteration %i  #Mistakes %i", iterations, mistakes)
    else:
      # the loop ended because the limit was reached, not because of 'break'
      logger.info('Reached maximum iteration count. Terminating training')
      if iterations and num_samples:
        print('Training accuracy={:f}'.format(1-mistakes/num_samples))

In [None]:
class algo(str, Enum):
  """
  Enumeration of the supported algorithms; add one member per algorithm.

  The value of a member is the name by which the algorithm is requested,
  e.g. algo('perceptron').
  """
  perceptron = 'perceptron'

  @classmethod
  def get_method(cls, a) -> 'class object':
    """
    returns the class implementing member 'a'; anything that is not a member
    of this enumeration raises KeyError
    """
    if a is not cls.perceptron:
      raise KeyError("Unrecognized algorithm %s" % a)
    return perceptron

  @property
  def method(self):
    """
    the class implementing this member
    """
    return algo.get_method(self)

def run(filename : 'str', algorithm : 'str', kwds):
  """
  Loads the gtd data from 'filename', splits it, and runs the requested
  algorithm through its train, test and eval stages.

  algorithm -- value of a member of 'algo', e.g. 'perceptron'
  kwds      -- dictionary of settings; 'testSplit' is used for the split and
               the whole dictionary is handed to the algorithm as keyword arguments
  """
  data = gtd(filename)
  data.splitData(kwds['testSplit'])

  # look up the implementing class, then build the model on the split data
  model_class = algo(algorithm).method
  model = model_class(data, **kwds)

  model.train()
  model.test()
  model.eval()

In [None]:
# input file (csv), expected in the run folder
filename = 'gtd.data'

# settings handed to run(): test fraction for the split and the perceptron parameters
settings = {'testSplit':0.2, 'learningRate':0.005, 'maxIterations':10000}
run(filename, 'perceptron', settings)

Preliminary results:
The following results use the 2011-14 data file provided at the source
link. The perceptron algorithm is run on the labeled data (incidents with an
assigned group - 19075 out of 42373). A single random 80/20 train/test split is
used (a hold-out split, not k-fold cross-validation). The accuracies seen after
10000 iterations with a learning rate of 0.005 are:

Training accuracy=0.773
Test accuracy=0.747

The file 'eval.txt' contains the predictions for the data that was not labeled in
the input (23298 out of 42373). The incidents are identified by the eventid field provided in source. 