<h1>Decision Tree </h1>

<p>This code loads the GTD data and tries to build a decision tree.</p>
<p>
Referenced from <a href="http://scikit-learn.org/stable/modules/tree.html#tree-algorithms">http://scikit-learn.org/stable/modules/tree.html#tree-algorithms</a>
</p>

In [4]:
class GTD:
    """Simple holder for a dataset and its labels.

    The four constructor arguments are stored unchanged on the instance
    under attributes of the same name:

    data          -- the feature values
    target        -- the label for each row of ``data``
    feature_names -- names of the columns in ``data``
    target_names  -- names describing the target
    """

    def __init__(self, data, target, feature_names, target_names):
        # No copying or validation happens here: the object only keeps
        # references to whatever the caller passed in.
        self.data, self.target = data, target
        self.feature_names, self.target_names = feature_names, target_names
        

In [5]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

<h1>Load the data</h1>
<p>Load the data</p>
<p>Separate the target and the inputs</p>

In [6]:
from copy import deepcopy

# Load the raw 'gtd_region_1.csv' extract into a DataFrame.
data = pd.read_csv('gtd_region_1.csv', encoding='latin1', low_memory=False)

# Separate the target from the data: the 'gname' column is the label, turned
# into integer codes by a LabelEncoder fitted on its distinct values.
group_names = data['gname'].unique()
targetLabelEncoder = preprocessing.LabelEncoder().fit(group_names)
requiredtarget = targetLabelEncoder.transform(data['gname'])

# Distinct group names paired with a running index 0..n-1.
# NOTE(review): this pairing is built independently of targetLabelEncoder and
# is only referenced by the commented-out mapping below -- confirm whether it
# is still needed and whether its numbering is meant to match the encoder.
requireduniquetarget = [group_names, list(range(len(group_names)))]
# requiredtarget = data['gname'].map( {requireduniquetarget[0]: requireduniquetarget[1]}).astype(int)

# Independent working copy, so later preprocessing does not touch `data`.
requireddata = deepcopy(data)

<h1>Preprocessing of Data</h1>
<p>Remove any features that are not required</p>
<p>Set up data into the GTD object</p>

In [7]:
# Columns removed before modelling: the '*_txt' label columns plus
# 'approxdate' and 'dbsource'.
# NOTE(review): presumably each '*_txt' column has a numeric code twin that is
# kept -- confirm against the dataset's codebook.
columns_to_drop = [
    'approxdate', 'country_txt', 'region_txt', 'alternative_txt', 'attacktype1_txt',
    'attacktype2_txt', 'attacktype3_txt', 'targtype1_txt', 'targsubtype1_txt', 'natlty1_txt',
    'targtype2_txt', 'targsubtype2_txt', 'natlty2_txt', 'targtype3_txt', 'targsubtype3_txt',
    'natlty3_txt', 'claimmode_txt', 'claimmode2_txt', 'claimmode3_txt', 'weaptype1_txt',
    'weapsubtype1_txt', 'weaptype2_txt', 'weapsubtype2_txt', 'weaptype3_txt', 'weapsubtype3_txt',
    'weaptype4_txt', 'weapsubtype4_txt', 'propextent_txt', 'hostkidoutcome_txt', 'dbsource',
]

# Derive the reduced frame from `data` instead of dropping in place on
# `requireddata`: an in-place drop raises KeyError when this cell is run a
# second time, because the columns are already gone. `data` itself is never
# modified, so this cell can be re-run safely and gives the same result.
requireddata = data.drop(columns_to_drop, axis=1)

# Bundle the feature values, the encoded target and the column names.
# NOTE(review): 'gname' (the target column) is not in the drop list, so it is
# still part of gtd.data / gtd.feature_names -- confirm this is intended
# before training on gtd.data.
gtd = GTD(requireddata.values, requiredtarget, list(requireddata), ['gname'])





In [8]:
# For the decision tree, encode the categorical data.
# To begin with, encode only year, country and region.

from sklearn.feature_extraction import DictVectorizer
features = data[['iyear', 'country_txt', 'region_txt']]

# print(features)

# One dict per row ({column: value}). Built once and handed to the vectorizer;
# the previous version built this list twice and threw the first copy away.
df = features.to_dict(orient='records')

# Turns the row dicts into a dense numeric matrix: each distinct string value
# becomes its own 0/1 column ('country_txt=Canada', ...) while the numeric
# 'iyear' is kept as a single column.
dv = DictVectorizer(sparse=False)
output = dv.fit_transform(df)

print(output)
# 


# gtdHotEncoder = preprocessing.OneHotEncoder()
# gtdHotEncoder.fit([requireddata['iyear'], requireddata['country'], requireddata['region']]) 
# gtdHotEncoder.n_values_
# gtdHotEncoder.feature_indices_

# a = gtdHotEncoder.transform([requireddata['iyear'], requireddata['country'], requireddata['region']]).toarray()
# print(a)

# f = open('out.txt', 'w')
# print >> f, a
# f.close()




[[  0.00000000e+00   1.00000000e+00   0.00000000e+00   1.97000000e+03
    1.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   1.97000000e+03
    1.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   1.97000000e+03
    1.00000000e+00]
 ..., 
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.01500000e+03
    1.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.01500000e+03
    1.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   2.01500000e+03
    1.00000000e+00]]


In [9]:
# np.savetxt('test.csv', output, delimiter=',')

In [10]:
# Column names of `output`, in column order.
# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); use whichever this installation provides and
# always display a plain list of names.
(dv.get_feature_names_out().tolist()
 if hasattr(dv, 'get_feature_names_out')
 else dv.get_feature_names())

['country_txt=Canada',
 'country_txt=Mexico',
 'country_txt=United States',
 'iyear',
 'region_txt=North America']

<h1>Decision Tree</h1>
<p> Fit the data and target using decision tree classifier
</p>

In [125]:
from sklearn import tree

# Depth-limited decision tree that predicts the encoded group label from the
# encoded year / country / region matrix built above.
# random_state is pinned so that re-running the notebook builds the same tree;
# without it, ties between equally good splits may be broken differently on
# each run.
clf = tree.DecisionTreeClassifier(max_depth=8, random_state=0)
# gtddata = gtd.data[:,1]
# gtddata = np.column_stack((a, gtd.data[:,6]))
# gtddata = np.column_stack((a, gtd.data[:,7]))
clf = clf.fit(output, requiredtarget)

<p>Install <b>graphviz</b> package if you don't have one.</p>
<p>And install <b>pydotplus</b>: <code>pip install pydotplus</code></p>

In [126]:
# Names for the columns of `output`, used to label the split nodes.
# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); use whichever this installation provides.
if hasattr(dv, 'get_feature_names_out'):
    tree_feature_names = dv.get_feature_names_out().tolist()
else:
    tree_feature_names = dv.get_feature_names()

# Write the fitted tree to gtd.dot in Graphviz format.
with open("gtd.dot", 'w') as f:
    # The call's result is deliberately not assigned: the previous version
    # rebound `f` to it, shadowing the open file handle inside the with-block.
    tree.export_graphviz(clf, out_file=f,
                         feature_names=tree_feature_names,
                         class_names=list(targetLabelEncoder.classes_),
                         filled=True, rounded=True,
                         special_characters=True)

# To render the exported file as a PDF, run in a shell:
#     dot -Tpdf gtd.dot -o gtd.pdf


In [127]:
# import pydotplus

# dot_data = tree.export_graphviz(clf, out_file=None)
# graph = pydotplus.graph_from_dot_data(dot_data)
# graph.write_pdf("gtd.pdf")

# from IPython.display import Image  
# dot_data = tree.export_graphviz(clf, out_file=None, 
#                          feature_names=gtd.feature_names,  
#                          class_names=gtd.target_names,  
#                          filled=True, rounded=True,  
#                          special_characters=True)  
# graph = pydotplus.graph_from_dot_data(dot_data)  
# Image(graph.create_png())  

In [128]:
# Predict the group label for the first six rows of `output`.
# The returned integers are the codes produced by targetLabelEncoder.
clf.predict(output[:6, :])

array([3150, 3150, 3150, 3150, 3150, 3150])

In [129]:
# Class probability estimates for the same first six rows of `output`.
clf.predict_proba(output[:6,:])

array([[  1.13855017e-04,   4.55420068e-05,   1.13855017e-05, ...,
          1.13855017e-05,   1.13855017e-05,   2.27710034e-05],
       [  1.13855017e-04,   4.55420068e-05,   1.13855017e-05, ...,
          1.13855017e-05,   1.13855017e-05,   2.27710034e-05],
       [  1.13855017e-04,   4.55420068e-05,   1.13855017e-05, ...,
          1.13855017e-05,   1.13855017e-05,   2.27710034e-05],
       [  1.13855017e-04,   4.55420068e-05,   1.13855017e-05, ...,
          1.13855017e-05,   1.13855017e-05,   2.27710034e-05],
       [  1.13855017e-04,   4.55420068e-05,   1.13855017e-05, ...,
          1.13855017e-05,   1.13855017e-05,   2.27710034e-05],
       [  1.13855017e-04,   4.55420068e-05,   1.13855017e-05, ...,
          1.13855017e-05,   1.13855017e-05,   2.27710034e-05]])

In [130]:
# Group names known to the label encoder that produced `requiredtarget`.
targetLabelEncoder.classes_

array(['1 May', '14 K Triad', '14 March Coalition', ...,
       'unk masked raiders',
       'unk, protest against USSR invasion of Afghanistan',
       'unknown leftists opposed to summit'], dtype=object)

In [131]:
# enc = preprocessing.OneHotEncoder()
# enc.fit([0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]) 
# enc.transform([[0, 1, 1]]).toarray()

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20; cross_val_score
# lives in sklearn.model_selection (available since 0.18).
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest on the same encoded features, scored with the
# library's default cross-validation. random_state is pinned so the score is
# reproducible between runs.
# NOTE: this rebinds `clf`, which previously held the fitted decision tree;
# cross_val_score is given this new, unfitted estimator.
clf = RandomForestClassifier(n_estimators=20, max_depth=4, min_samples_split=2,
                             random_state=0)
scores = cross_val_score(clf, output, requiredtarget)

# Mean score across the cross-validation folds.
scores.mean()

