In [151]:
import json, re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import *
from sklearn.metrics import accuracy_score, pairwise_distances
from sklearn.ensemble import RandomForestClassifier
# For transformations and predictions
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


# For predictions
from sklearn.base import BaseEstimator
import scipy

In [152]:
with open(r'conditional_order.json', 'r') as f:
    conditional_order = json.load(f)
df = pd.DataFrame(conditional_order).set_index("מספר תיק")

In [153]:
seif_pattern = re.compile(r"(?:סעיף|ס'|סעיפים)\s{0,2}[\d()אבגדהוזחטיכלמנסעפצקרשת]+")

In [154]:
df["seifim"] = df["הוראות החיקוק שפורטו בהסדר"].apply(seif_pattern.findall)

In [155]:
df.drop(df.index[df["seifim"].apply(lambda s: s==[])], inplace = True) # we drop all other we didnt clean

In [156]:
df["seifim"].apply(lambda s: s==[]).value_counts()  #we will work with 501 enteries (from original 507)

False    501
Name: seifim, dtype: int64

In [157]:
#cleanfunction for all groups
def clenupDict(seifimitems):
 dictionary=dict()
 for k,v in seifimitems.items():
  new_key=(re.sub(r"\([^)]+\)|\(\d|[\אבגדהוזחטיכלמנסעפצקרשת]+", "", k))  #maybe remove here more
  dictionary.update({new_key:v})
 return (dictionary)

In [158]:
X = df["seifim"].apply(lambda seifim: {s.strip(" סעיפים סעיף' """):1 for s in seifim })  #coloumn is the seifim
X_new=X.apply(clenupDict)
vectorizer = DictVectorizer(sparse=False)
vectorizer.fit(X_new)
df_seifim = pd.DataFrame(vectorizer.transform(X_new), columns=vectorizer.feature_names_)

In [159]:
#now we have several coloumn with the samename lets aggrigate them
df_seifim.groupby(df_seifim.columns, axis=1).agg(np.max)

Unnamed: 0,Unnamed: 1,11,144,151,163,17,173,191,192,194,...,60,61,62,63,7,70,75,98,3,338
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [160]:
df_conditional_order = df.reset_index()\
  .join(df_seifim)\
  .rename(columns={"תיאור העובדות המהוות עבירה שבהן הודה החשוד": "facts", "מספר תיק":"id"})\
  .set_index("id")\
  .drop(["הוראות החיקוק שפורטו בהסדר", "נימוקים משתנים לסגירת התיק בהסדר", "תנאי ההסדר", "seifim", "יחידה"],axis=1)
df_conditional_order.head()

Unnamed: 0_level_0,facts,Unnamed: 2_level_0,11,144,151,163,17,173,191,192,...,60,61,62,63,7,70,75,98,3,338
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3373/15,"בתאריך 30.1.15, החזיק החשוד, לשם מסחר, בדוכן ב...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7937/15,"ביום 09/06/15 או סמוך לאחריו, מצא החשוד 12 המח...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4625/15,החשודות הינן סייעת וגננת בגן בצפון הארץ. ביום ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3613/16,"1. במועדים הרלבנטיים לתלונת המתלוננת, עבד החשו...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9213/13,"הנאשמת הינה בעלת כלב מסוג ""רועה בלגי"". בשלהי ח...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [161]:
def cleanfacts(dataseries):  #clean data only dates, we can remove numbers as weel??
   return re.sub(r'בתאריך|ביום|או|[0-9]+.[0-9]+.[0-9]|בשנת|([0-9]{2}\.[0-9]{2}\.[0-9]{4})|([0-9]{2}\/[0-9]{2}\/[0-9]{4})+',"",dataseries)

In [162]:
df_conditional_order["clean_facts"] = df_conditional_order["facts"].apply(cleanfacts)

In [163]:
vectorizer = CountVectorizer(binary=True)  #thisapproch isnt sogood, cause the correlation should be low
X = vectorizer.fit_transform(df_conditional_order["clean_facts"])
cols = [k for k,v in sorted(vectorizer.vocabulary_.items(), key= lambda t:t[1])]
X = pd.DataFrame(X.todense(), columns=cols)

In [164]:
#option 1-intuitive(like inclass, just cleaner data)
y = df_conditional_order["415"] 
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=20191120)

model = RandomForestClassifier(random_state=10, n_estimators=100,max_depth=5)
model.fit(X_train, y_train)
print ("RandomForest accuracy in train:",accuracy_score(model.predict(X_train), y_train))
print ("RandomForest accuracy in test:",accuracy_score(model.predict(X_test), y_test))

RandomForest accuracy in train: 0.8742857142857143
RandomForest accuracy in test: 0.8543046357615894


In [165]:
#option 1-intuitive(all)
#y = df_conditional_order["415"] 
y=df_conditional_order[["361","338","379","3","60","420","418","348","415"]]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=20191120)

model = RandomForestClassifier(random_state=5, n_estimators=100,max_depth=5)
model.fit(X_train, y_train)
print ("RandomForest accuracy in train:",accuracy_score(model.predict(X_train), y_train))
print ("RandomForest accuracy in test:",accuracy_score(model.predict(X_test), y_test))

RandomForest accuracy in train: 0.3342857142857143
RandomForest accuracy in test: 0.33774834437086093


In [166]:
def get_category():
  lst=[]
  new_df_cat=df_conditional_order.drop(['facts','',"clean_facts"], axis=1)
  for i,row in new_df_cat.iterrows():
    if (1 in row.values):
      val=(row.iloc[row.values==1].index[0])
      #for simplicity willonly ove one category per text
      #s=[]
      #for v in val:
       # s.append(v)
      lst.append(val)
  lst.append('Nan')
  return lst

In [167]:
#option 2- try create our category according to the map and run using the models
df_conditional_order['category']=get_category()    

In [168]:
df_conditional_model_order =df_conditional_order[['clean_facts','category']]  #this will be our initiative data set for the models

In [169]:
#create the entire coloumns to one with  one category
unique_cat = set(df_conditional_model_order['category'].unique())
df_conditional_model_order['category'] = pd.Categorical(df_conditional_model_order['category'], categories=unique_cat).codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [170]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=20191120)

In [171]:
#cosine matric
k=30
model = KNeighborsClassifier(n_neighbors=k,metric='cosine')
model.fit(X_train, y_train)
#df_conditional_model_order['KNN_Pred'] = model.predict(X)

print ("knn accuracy in train:",accuracy_score(model.predict(X_train), y_train))
print ("knn accuracy in test:",accuracy_score(model.predict(X_test), y_test))

knn accuracy in train: 0.5685714285714286
knn accuracy in test: 0.5364238410596026


In [172]:
k=10
model = KNeighborsClassifier(n_neighbors=k,metric='hamming')
model.fit(X_train, y_train)
#df_conditional_model_order['KNN_Pred'] = model.predict(X)

print ("haming knn accuracy in train:",accuracy_score(model.predict(X_train), y_train))
print ("haming knn accuracy in test:",accuracy_score(model.predict(X_test), y_test))

haming knn accuracy in train: 0.4142857142857143
haming knn accuracy in test: 0.36423841059602646


In [173]:
#now lets try RandomForest
model = RandomForestClassifier(random_state=3, n_estimators=100,max_depth=3)
model.fit(X_train, y_train)
print ("RandomForest accuracy in train:",accuracy_score(model.predict(X_train), y_train))
print ("RandomForest accuracy in test:",accuracy_score(model.predict(X_test), y_test))

RandomForest accuracy in train: 0.28
RandomForest accuracy in test: 0.2913907284768212


In [174]:
#option3- create our own transformer for the text
class CustomEncoderTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,cat,mapped_dic):#,maybe create here the dic according to input dic 
        TransformerMixin.__init__(self)
        self.cat=cat
        self.cat_dict=mapped_dic
    def fit(self, X, y=None):
        return self
    def transform(self, X):
      lst=[]
      for text in X:
        v=self.cat_dict[self.cat]
        r=re.compile(v)
        if len(r.findall(text))>1:
          lst.append(1)
        else:
          lst.append(0)
      return(lst)
      def fit_transform(self,X):
        return self.transform(self,X)

In [175]:
#ppl = Pipeline([
#    ("my_encoder", CustomHotEncoder())
#])
#ppl.fit(df_conditional_order["clean_facts"], df_conditional_model_order['category'])
#created it according to the number of enteriesfromthe text and the seif, took only 9 features
mapped_dic={"415":r"(?:כזב|מרמה|קבלת דבר במרמה|שווא)","348":r"(?:נשיקה|נשקה|מעשה מגונה|הסכמתה|ללא הסכמה|סיפוק מיני|גירוי|מעשים מגונים|לבוש מפירים|איבר מינה)","418":r"(?:זיוף|זיוף תרופות|מזויפת)","420":r"(?:שימוש במסמך מזויף|מסמך מזויף|ערבות מזויפת|זייף המחאה|המחאה|)","60":r"(?:סימן|סימן מסחר|סימני מסחר|בלא רשות)","3":r"(?:הטרדה מינית|סחיטה|מעשים מגונים|מיני)","379":r"(?:מנע|נותר לעמוד|דחף)","338":r"(?:פזיזות|רשלנות|נפיץ|מותירה)","361":r"(?:|נטישה||בלא השגחה|ללא השגחה|משוטט לבדו)"}

features_cat=["415","348","418","420","60","3","379","338","361"] 
for category in features_cat:
  #my_CustomHotEncoder = CustomEncoderTransformer(category)
  name="category "+str(category)
  my_CustomHotEncoder = CustomEncoderTransformer(category,mapped_dic)
  df_conditional_order[name]=my_CustomHotEncoder.fit_transform(df_conditional_order["clean_facts"])

In [176]:
#now lets test
X_n=df_conditional_order[["category 415"]]
y = df_conditional_order["415"] 


X_train, X_test, y_train, y_test = \
    train_test_split(X_n, y, train_size=0.7, test_size=0.3, random_state=20191120)

model = RandomForestClassifier(random_state=100, n_estimators=10,max_depth=20)#100,100,10
model.fit(X_train, y_train)
print ("RandomForest accuracy in train:",accuracy_score(model.predict(X_train), y_train))
print ("RandomForest accuracy in test:",accuracy_score(model.predict(X_test), y_test))

RandomForest accuracy in train: 0.8942857142857142
RandomForest accuracy in test: 0.8807947019867549


In [177]:
#now lets test, only for 348
X_n=df_conditional_order[["category 348"]]
y = df_conditional_order["348"] 
#y=df_conditional_model_order['category']
#print(X_n)
#print(y)

X_train, X_test, y_train, y_test = \
    train_test_split(X_n, y, train_size=0.7, test_size=0.3, random_state=20191120)

model = RandomForestClassifier(random_state=100, n_estimators=10,max_depth=10)#100,100,10
model.fit(X_train, y_train)
print ("RandomForest accuracy in train:",accuracy_score(model.predict(X_train), y_train))
print ("RandomForest accuracy in test:",accuracy_score(model.predict(X_test), y_test))

RandomForest accuracy in train: 0.9314285714285714
RandomForest accuracy in test: 0.9470198675496688


In [178]:
#now lets test, only for 418 and soonforeach one
X_n=df_conditional_order[["category 418"]]
y = df_conditional_order["418"] 
#y=df_conditional_model_order['category']
#print(X_n)
#print(y)

X_train, X_test, y_train, y_test = \
    train_test_split(X_n, y, train_size=0.7, test_size=0.3, random_state=20191120)

model = RandomForestClassifier(random_state=10, n_estimators=5,max_depth=5)#100,100,10
model.fit(X_train, y_train)
print ("RandomForest accuracy in train:",accuracy_score(model.predict(X_train), y_train))
print ("RandomForest accuracy in test:",accuracy_score(model.predict(X_test), y_test))

RandomForest accuracy in train: 0.8685714285714285
RandomForest accuracy in test: 0.9006622516556292


In [179]:
#now lets test, all
X_n=df_conditional_order[["category 361","category 338","category 379","category 3","category 60","category 420","category 418","category 348","category 415"]]
#y = df_conditional_order["361"] 
y=df_conditional_model_order['category']


X_train, X_test, y_train, y_test = \
    train_test_split(X_n, y, train_size=0.7, test_size=0.3, random_state=20191120)

model = RandomForestClassifier(random_state=100, n_estimators=50,max_depth=50)#100,100,10
model.fit(X_train, y_train)
print ("RandomForest accuracy in train:",accuracy_score(model.predict(X_train), y_train))
print ("RandomForest accuracy in test:",accuracy_score(model.predict(X_test), y_test))

RandomForest accuracy in train: 0.2257142857142857
RandomForest accuracy in test: 0.2052980132450331


In [180]:
#now lets test, all
X_n=df_conditional_order[["category 361","category 338","category 379","category 3","category 60","category 420","category 418","category 348","category 415"]]
y=df_conditional_order[["361","338","379","3","60","420","418","348","415"]]

X_train, X_test, y_train, y_test = \
    train_test_split(X_n, y, train_size=0.7, test_size=0.3, random_state=20191120)

model = RandomForestClassifier(random_state=10, n_estimators=50,max_depth=5)#100,100,10
model.fit(X_train, y_train)
print ("RandomForest accuracy in train:",accuracy_score(model.predict(X_train), y_train))
print ("RandomForest accuracy in test:",accuracy_score(model.predict(X_test), y_test))
print (classification_report(y_true=y_test,y_pred=model.predict(X_test)))
print (classification_report(y_true=y_train,y_pred=model.predict(X_train)))

RandomForest accuracy in train: 0.4857142857142857
RandomForest accuracy in test: 0.47019867549668876
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        14
          1       0.00      0.00      0.00        17
          2       1.00      0.23      0.38        13
          3       1.00      0.33      0.50         6
          4       1.00      0.77      0.87        13
          5       0.62      0.28      0.38        18
          6       0.56      0.31      0.40        16
          7       0.92      0.61      0.73        18
          8       0.88      0.29      0.44        24

avg / total       0.64      0.31      0.40       139

             precision    recall  f1-score   support

          0       0.00      0.00      0.00        32
          1       0.00      0.00      0.00        20
          2       0.67      0.11      0.18        19
          3       0.71      0.42      0.53        24
          4       0.96      0.72      0.83    

  'precision', 'predicted', average, warn_for)


In [181]:
#appendix- lets try correlations ,lets see if we have correlation betweenchildneglect to summer
#lets firts find what is summer
from datetime import datetime
def get_summer_time(data):
  date_pattren=re.compile(r"[0-9]{2}\.[0-9]{2}\.[0-9]{4}|[0-9]{2}\/[0-9]{2}\/[0-9]{4}")
  v=date_pattren.findall(data) 
  if len(v)>0:  
    dd=v[0]
    try:
      results=datetime.strptime(v[0],'%d/%m/%Y').month
    except ValueError:
      results=datetime.strptime(v[0],'%d.%m.%Y').month
    if ((results>=5) and (results<=10)):  #summer
      return True
    else:
      return False
  else:
   return False


In [182]:
df_conditional_order['summer']=df['תיאור העובדות המהוות עבירה שבהן הודה החשוד'].apply(get_summer_time)

In [183]:
X_n=df_conditional_order[["361"]]  #ילדים באוטו
y=df_conditional_order['summer']

X_train, X_test, y_train, y_test = \
    train_test_split(X_n, y, train_size=0.7, test_size=0.3, random_state=20191120)

model = RandomForestClassifier(random_state=10, n_estimators=10,max_depth=10)#100,100,10
model.fit(X_train, y_train)
print ("RandomForest accuracy in train:",accuracy_score(model.predict(X_train), y_train))
print ("RandomForest accuracy in test:",accuracy_score(model.predict(X_test), y_test))
print (classification_report(y_true=y,y_pred=model.predict(X_n)))

RandomForest accuracy in train: 0.9657142857142857
RandomForest accuracy in test: 0.9403973509933775
             precision    recall  f1-score   support

      False       0.96      1.00      0.98       480
       True       0.00      0.00      0.00        21

avg / total       0.92      0.96      0.94       501



  'precision', 'predicted', average, warn_for)


In [184]:
#an example
df_summer = pd.concat([
  df_conditional_order[["summer", "361"]].reset_index(),
  pd.DataFrame(model.predict(X_n),columns=["predicted"])],
  axis=1
).rename(columns={"summer": "summer_actual"})
#df_summer

In [185]:
df_summer[(df_summer.summer_actual==df_summer.predicted) & (df_summer.summer_actual==0) & (df_summer["361"]==1)]  #הזנחת ילדים לא קרה בקיץ!!

Unnamed: 0,id,summer_actual,361,predicted
2,4625/15,False,1.0,False
12,2524/17,False,1.0,False
56,4056/15,False,1.0,False
77,5553/14,False,1.0,False
98,4567/15,False,1.0,False
131,3680/16,False,1.0,False
132,6254/16,False,1.0,False
135,995/16,False,1.0,False
137,1874/16,False,1.0,False
160,4942/14,False,1.0,False
