<a href="https://colab.research.google.com/github/sundarp17/sundar_info5731_fall2020/blob/master/svm_highkappa_semisupervised_unbalanced.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

train_kappa = pd.read_csv(r'/content/train_data_highkappa.csv')
train_kappa.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,659,Appellant had stated to the officers that she ...,Invalid
1,3456,We shall discuss the facts more fully in conne...,Others
2,2043,"â€œPerjury is a false statement, either writte...",Invalid
3,3344,The offense is felony theft by false pretext; ...,Issue
4,3231,Numerous contentions urging the commission of ...,Issue


In [2]:
train_kappa['Target'].unique()

array(['Invalid', 'Others', 'Issue', 'Analysis', 'Facts', 'Conclusion',
       'Rule/Law/Holding'], dtype=object)

In [3]:
train_kappa['Target']=train_kappa['Target'].replace(['Others'],'Invalid')
train_kappa['Target'].unique()

array(['Invalid', 'Issue', 'Analysis', 'Facts', 'Conclusion',
       'Rule/Law/Holding'], dtype=object)

In [4]:
#cleaning
import nltk
import re
import string
nltk.download('stopwords')
nltk.download('wordnet')
stopword=nltk.corpus.stopwords.words('english')
from nltk.stem import WordNetLemmatizer
wl= WordNetLemmatizer()

def clean_text(text):
  text="".join([word.lower() for word in text if word not in string.punctuation])
  tokens = re.split('\W+',text)
  text = [wl.lemmatize(word) for word in tokens if word not in stopword]
  return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer = clean_text)
X_tfidf = tfidf_vect.fit_transform(train_kappa['Sentence'])
print(X_tfidf.shape)

(2951, 5835)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.calibration import CalibratedClassifierCV


X_train, x_val, Y_train, y_val = train_test_split(X_tfidf,train_kappa['Target'],test_size=0.26,random_state=42)
support = svm.LinearSVC()
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model=calibrator.fit(X_train, Y_train)
pred_svm = model.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6848958333333334
                  precision    recall  f1-score   support

        Analysis       0.54      0.51      0.52       104
      Conclusion       0.56      0.54      0.55        41
           Facts       0.79      0.88      0.83       416
         Invalid       0.49      0.45      0.47       109
           Issue       0.64      0.38      0.48        55
Rule/Law/Holding       0.42      0.37      0.40        43

        accuracy                           0.68       768
       macro avg       0.57      0.52      0.54       768
    weighted avg       0.67      0.68      0.67       768



In [8]:
def frequency_table(data):
    frequencytable = {}
    for key in data:
        if key in frequencytable:
            frequencytable[key] += 1
        else:
            frequencytable[key] = 1
    return frequencytable

frequency_table(train_kappa['Target'])

{'Analysis': 439,
 'Conclusion': 147,
 'Facts': 1510,
 'Invalid': 475,
 'Issue': 190,
 'Rule/Law/Holding': 190}

In [9]:
test = pd.read_csv(r'/content/test_data.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,2173,Darden Sr. testified â€œI saw he was staggerin...,Facts
1,466,Additional moral justification may have been d...,Analysis
2,525,"85, 22 S.W. 140, wherein the accused was actin...",Rule/Law/Holding
3,2199,Under a proper instruction from the court on m...,Facts
4,79,He is eligible for leaves to renew contacts wi...,Facts


In [10]:
test['Target']=test['Target'].replace(['Others'],'Invalid')
test['Target'].unique()

array(['Facts', 'Analysis', 'Rule/Law/Holding', 'Conclusion', 'Invalid',
       'Issue'], dtype=object)

In [12]:
test['Sentence'] = test['Sentence'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
test['Sentence'] = test['Sentence'].str.replace('[^\w\s]','')
from nltk.corpus import stopwords
words = stopwords.words('english')
test['Sentence'] = test['Sentence'].apply(lambda x: " ".join(x for x in x.split() if x not in words))

In [13]:
test['Sentence'].head()

0    darden sr testified âœi saw staggering â prett...
1    additional moral justification may due complai...
2    85 22 sw 140 wherein accused acting upon advic...
3    proper instruction court murder malice murder ...
4                eligible leaves renew contacts family
Name: Sentence, dtype: object

In [14]:
len(test['Sentence'])

521

In [15]:
unlabel_1 = test.loc[:100]
print("length of unlabel_1",len(unlabel_1))
unlabel_2 = test.loc[101:200]
print("length of unlabel_2",len(unlabel_2))
unlabel_3 = test.loc[201:300]
print("length of unlabel_3",len(unlabel_3))
unlabel_4 = test.loc[301:400]
print("length of unlabel_4",len(unlabel_4))
unlabel_5 = test.loc[401:]
print("length of unlabel_5",len(unlabel_5))

length of unlabel_1 101
length of unlabel_2 100
length of unlabel_3 100
length of unlabel_4 100
length of unlabel_5 120


In [16]:
unlabel_1.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,2173,darden sr testified âœi saw staggering â prett...,Facts
1,466,additional moral justification may due complai...,Analysis
2,525,85 22 sw 140 wherein accused acting upon advic...,Rule/Law/Holding
3,2199,proper instruction court murder malice murder ...,Facts
4,79,eligible leaves renew contacts family,Facts


In [17]:
unlabel_2.reset_index(inplace=True)
del unlabel_2['index']
unlabel_2.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,1207,examination occurred night middle may testifie...,Invalid
1,3400,undisputed testimony reflects appellant drivin...,Facts
2,2072,appellant testify offer evidence behalf,Facts
3,517,56 uniform act regulating traffic highways gis...,Rule/Law/Holding
4,906,agreed accompany gave pawn ticket money reclai...,Facts


In [18]:
unlabel_3.reset_index(inplace=True)
del unlabel_3['index']
unlabel_3.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,1882,find evidence amply sufficient sustain juryâs ...,Analysis
1,964,actions show thatâ,Invalid
2,1417,certification attending physician shown filed ...,Facts
3,2932,testified agent bland first left automobile wa...,Facts
4,1584,officer testified could smell odor alcohol app...,Facts


In [19]:
unlabel_4.reset_index(inplace=True)
del unlabel_4['index']
unlabel_4.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,266,section 6 pl,Invalid
1,3469,amended answer appellant alleged bond invalid ...,Analysis
2,1471,evidence sufficient support conviction appella...,Analysis
3,1990,cases cited note 19,Invalid
4,2757,july 8 1960 order entered revoking said probat...,Facts


In [20]:
unlabel_5.reset_index(inplace=True)
del unlabel_5['index']
unlabel_5.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,2979,overrule appellantâs contention court erred pe...,Analysis
1,2135,state regarded authority case,Invalid
2,1591,testimony adduced appellant mother lillie mae ...,Facts
3,658,appellant needle marks inside left elbow,Facts
4,2562,analysis contents 3 capsules chemist kenneth a...,Facts


In [21]:
model.classes_

array(['Analysis', 'Conclusion', 'Facts', 'Invalid', 'Issue',
       'Rule/Law/Holding'], dtype='<U16')

In [22]:
x_un1 = tfidf_vect.transform(unlabel_1['Sentence'])
pred_unlabel_1 = model.predict_proba(x_un1)
pred_unlabel_1

array([[2.90125582e-05, 6.21944768e-04, 9.97200956e-01, 2.00016885e-03,
        2.35997451e-05, 1.24317920e-04],
       [7.44227289e-02, 2.90658654e-04, 9.21294149e-01, 1.77669054e-05,
        3.04955546e-03, 9.25140950e-04],
       [4.37109240e-04, 8.17750318e-04, 2.92997825e-02, 9.61531205e-01,
        6.19681680e-04, 7.29447154e-03],
       [6.76732996e-03, 8.80727256e-05, 6.61725061e-01, 3.15401579e-01,
        6.70709170e-04, 1.53472480e-02],
       [3.03366036e-04, 6.37919082e-04, 9.96803376e-01, 2.00180769e-03,
        2.30372748e-05, 2.30493514e-04],
       [1.62759682e-05, 2.66402505e-04, 9.99055098e-01, 5.76444116e-04,
        6.23834259e-06, 7.95407205e-05],
       [6.66578144e-01, 4.91797714e-03, 3.13965437e-01, 9.63261780e-03,
        2.35653490e-03, 2.54928996e-03],
       [1.71730008e-04, 2.54492658e-01, 5.88102462e-01, 1.56945857e-01,
        3.48892900e-06, 2.83803912e-04],
       [5.06756636e-02, 8.02618733e-05, 8.83821422e-05, 9.48852913e-01,
        1.25132112e-05, 

In [23]:
import numpy as np
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_1:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[2, 2, 2, 0, 2, 2, 2, 2, 2, 3, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 5, 3]
[0.9972009561635322, 0.9968033764008846, 0.9990550983485111, 0.9846864068053934, 0.9964028662013704, 0.9987232012429391, 0.9972716934030655, 0.997120193387933, 0.9942954139845491, 0.9888002621978882, 0.9947159627795978, 0.9997980608458925, 0.9923310687124642, 0.9910361127046684, 0.9912806153693244, 0.9889905882770152, 0.9982077483665502, 0.9985271534027402, 0.9975814279203458, 0.9983908441104118, 0.9908782020294689, 0.992734639826663, 0.996760626245344, 0.9996826952219955, 0.9972584861965235, 0.9936862424286748, 0.9892990736511916, 0.9801128440281638, 0.9957056106032011, 0.9901438480285365, 0.9807303254006517, 0.9996136557206662, 0.9808348153233375, 0.9906975016264579, 0.990054520977255, 0.9983853517860302, 0.99351757448055, 0.9979769673547184, 0.9802274765520383, 0.9910193489192369]
[0, 4, 5, 10, 11, 12, 17, 18, 23, 25, 29, 31, 32, 33, 34, 35, 39, 40, 45, 48, 50, 51

In [24]:
unlabel_1 = unlabel_1.loc[[0,4,5,10,11,12,17,18,23,25,29,31,32,33,34,35,39,40,45,48,50,51,55,56,62,65,68,69,70,74,75,76,77,87,89,94,95,96,97,98],:]
unlabel_1.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,2173,darden sr testified âœi saw staggering â prett...,Facts
4,79,eligible leaves renew contacts family,Facts
5,1955,got downstairs ponce went car wife left home,Facts
10,2262,first says evidence support charge comment upo...,Issue
11,1604,police notified deceased found lying mrs moers...,Facts


In [25]:
frame_1 = [train_kappa,unlabel_1]
train_1 = pd.concat(frame_1)
len(train_1)

2991

In [32]:
x_train_1 = tfidf_vect.transform(train_1['Sentence'])
x_train_1.shape
X_train, x_val, Y_train, y_val = train_test_split(x_train_1,train_1['Target'],test_size=0.2,random_state=42)
support = svm.LinearSVC()
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_1=calibrator.fit(X_train, Y_train)
pred_svm = model_1.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.672787979966611
                  precision    recall  f1-score   support

        Analysis       0.53      0.48      0.50        82
      Conclusion       0.62      0.39      0.48        33
           Facts       0.78      0.85      0.81       325
         Invalid       0.46      0.50      0.48        88
           Issue       0.67      0.49      0.56        37
Rule/Law/Holding       0.46      0.38      0.42        34

        accuracy                           0.67       599
       macro avg       0.59      0.51      0.54       599
    weighted avg       0.66      0.67      0.67       599



In [33]:
unlabel_2.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,1207,examination occurred night middle may testifie...,Invalid
1,3400,undisputed testimony reflects appellant drivin...,Facts
2,2072,appellant testify offer evidence behalf,Facts
3,517,56 uniform act regulating traffic highways gis...,Rule/Law/Holding
4,906,agreed accompany gave pawn ticket money reclai...,Facts


In [34]:
x_un2 = tfidf_vect.transform(unlabel_2['Sentence'])
pred_unlabel_2 = model_1.predict_proba(x_un2)
pred_unlabel_2

array([[2.06748742e-03, 9.03589119e-05, 9.87340550e-01, 1.04563611e-02,
        1.15391528e-05, 3.37037210e-05],
       [2.33325180e-04, 3.15596716e-04, 9.09400536e-01, 8.99099838e-02,
        2.38642297e-05, 1.16693953e-04],
       [4.56658640e-04, 1.55974037e-04, 9.98560626e-01, 6.22265229e-04,
        2.01642249e-04, 2.83368170e-06],
       [8.72362810e-04, 1.72896629e-04, 9.82703403e-01, 1.26955910e-02,
        3.95329829e-06, 3.55179311e-03],
       [1.89277466e-05, 9.69955270e-04, 9.91169621e-01, 2.79005996e-04,
        4.79296243e-06, 7.55769719e-03],
       [5.04540073e-01, 4.03727947e-03, 3.94073568e-01, 1.96825679e-03,
        9.53203399e-02, 6.04821755e-05],
       [5.51416855e-03, 3.35400613e-04, 9.92865827e-01, 1.09269770e-03,
        1.57455016e-04, 3.44513272e-05],
       [1.75139731e-04, 4.52467374e-04, 9.98859401e-01, 4.39141092e-04,
        5.56840326e-05, 1.81663997e-05],
       [7.84146599e-04, 1.77318774e-04, 9.98197749e-01, 5.67012292e-04,
        6.57605597e-05, 

In [35]:
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_2:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 3, 2, 1, 2, 2, 2, 2, 5, 2, 2, 2, 2, 3, 2, 2, 2, 2, 4, 2]
[0.9873405497057722, 0.998560626163031, 0.9827034031812859, 0.9911696208368082, 0.9928658267928504, 0.9988594013713826, 0.998197749276386, 0.9997882226723475, 0.9906703395163713, 0.9846079264506703, 0.9989437334134439, 0.9887551383007501, 0.9876975034522971, 0.995316796238503, 0.999718039614436, 0.9904545424545884, 0.9982607676891384, 0.9997646759097027, 0.9966665420138819, 0.9973738382902736, 0.9957575187749984, 0.9991700530769136, 0.9981015808316438, 0.9993897274550881, 0.9874435154037188, 0.998076485514705, 0.99922385970357, 0.9882768825768523, 0.9922662943430366, 0.9970471294758356, 0.9829688578390461, 0.987941570685464, 0.9995107522934077, 0.9932617017784631, 0.9907911558885182, 0.9903816959901347]
[0, 2, 3, 4, 6, 7, 8, 12, 20, 21, 29, 32, 34, 36, 37, 38, 39, 42, 44, 49, 50, 59, 63, 64, 68, 69, 70, 71, 75, 83, 85, 86, 88, 91, 96, 99]
36
36
36


In [36]:
unlabel_2 = unlabel_2.loc[[0, 2, 3, 4, 6, 7, 8, 12, 20, 21, 29, 32, 34, 36, 37, 38, 39, 42, 44, 49, 50, 59, 63, 64, 68, 69, 70, 71, 75, 83, 85, 86, 88, 91, 96, 99 ],:]
unlabel_2.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,1207,examination occurred night middle may testifie...,Invalid
2,2072,appellant testify offer evidence behalf,Facts
3,517,56 uniform act regulating traffic highways gis...,Rule/Law/Holding
4,906,agreed accompany gave pawn ticket money reclai...,Facts
6,570,appellant performed work asked 7500 money ther...,Facts


In [37]:
frame_2 = [train_1,unlabel_2]
train_2 = pd.concat(frame_2)
len(train_2)

3027

In [45]:
x_train_2 = tfidf_vect.transform(train_2['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_2,train_2['Target'],test_size=0.25,random_state=2)
support = svm.LinearSVC()
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_2=calibrator.fit(X_train, Y_train)
pred_svm = model_2.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6803170409511229
                  precision    recall  f1-score   support

        Analysis       0.54      0.50      0.52       115
      Conclusion       0.63      0.47      0.54        40
           Facts       0.76      0.89      0.82       385
         Invalid       0.55      0.49      0.52       136
           Issue       0.65      0.42      0.52        40
Rule/Law/Holding       0.54      0.32      0.40        41

        accuracy                           0.68       757
       macro avg       0.61      0.52      0.55       757
    weighted avg       0.67      0.68      0.67       757



In [46]:
x_un3 = tfidf_vect.transform(unlabel_3['Sentence'])
pred_unlabel_3 = model_2.predict_proba(x_un3)
pred_unlabel_3

array([[9.58422410e-01, 4.13340318e-02, 3.53740314e-06, 2.08736383e-04,
        2.04973022e-05, 1.07875751e-05],
       [5.32082362e-01, 2.20338334e-03, 1.16858829e-01, 3.48793909e-01,
        5.40345256e-05, 7.48196452e-06],
       [1.92764334e-05, 2.91484317e-05, 9.93027573e-01, 6.69573158e-03,
        9.25811192e-05, 1.35689634e-04],
       [3.61941922e-05, 2.32562381e-03, 9.94735670e-01, 2.73748771e-03,
        3.81245640e-05, 1.26899894e-04],
       [1.62066372e-04, 1.30258817e-04, 9.93698914e-01, 5.47675309e-03,
        1.91015825e-05, 5.12906059e-04],
       [2.03781792e-02, 2.79502341e-05, 2.00196794e-01, 7.63593812e-01,
        1.57827434e-02, 2.05203907e-05],
       [9.47746411e-04, 5.53963225e-04, 9.45358733e-01, 5.22760199e-02,
        8.17090236e-04, 4.64469578e-05],
       [2.55381675e-03, 5.26414559e-04, 9.96866123e-01, 3.07901974e-05,
        1.58778060e-06, 2.12678995e-05],
       [1.23858359e-03, 1.80587826e-04, 9.97793695e-01, 1.13870558e-05,
        4.80172521e-07, 

In [47]:
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_3:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 0, 2, 2]
[0.9930275727990354, 0.9947356698274391, 0.9936989140746526, 0.9968661228116026, 0.9977936945013316, 0.9812120010817951, 0.9990802164846375, 0.9870769356407149, 0.9906248945188626, 0.9995772334481631, 0.997812975335988, 0.9993530870885573, 0.9907365483564019, 0.9978642667615891, 0.9928708120098781, 0.9991577434907032, 0.9932221136005938, 0.9863989428709641, 0.998397070302069, 0.9984347461228904, 0.9957982982551646, 0.9988450314299517, 0.9924171519856929, 0.9981350594938138, 0.9902040942481136, 0.9966407541628297, 0.9978386534332022, 0.9995000914014914, 0.9996667552853625, 0.9905811847362126, 0.9996277302118475, 0.9979238550443501, 0.993983158372029, 0.9992421172514936, 0.997632904920351, 0.9987374684729201, 0.9857251461869773, 0.9977702344461576, 0.9880718119862245, 0.9975991600391366, 0.989518500528339, 0.9990646520285288, 0.9975418815877415, 0.

In [48]:
unlabel_3 = unlabel_3.loc[[2, 3, 4, 7, 8, 9, 12, 15, 18, 20, 26, 27, 29, 30, 34, 35, 36, 40, 43, 46, 47, 48, 49, 51, 52, 53, 54, 56, 63, 64, 65, 66, 67, 68, 70, 72, 73, 75, 78, 79, 87, 89, 90, 91, 92, 96, 97],:]
unlabel_3.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
2,1417,certification attending physician shown filed ...,Facts
3,2932,testified agent bland first left automobile wa...,Facts
4,1584,officer testified could smell odor alcohol app...,Facts
7,2115,told get back bed said floor cool sick stomach...,Facts
8,1278,testified first time would come bed week altho...,Facts


In [57]:
frame_3 = [train_2,unlabel_3]
train_3 = pd.concat(frame_3)
len(train_3)
x_train_3 = tfidf_vect.transform(train_3['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_3,train_3['Target'],test_size=0.3,random_state=42)
support = svm.LinearSVC()
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_3=calibrator.fit(X_train, Y_train)
pred_svm = model_3.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6717226435536294
                  precision    recall  f1-score   support

        Analysis       0.47      0.44      0.46       125
      Conclusion       0.78      0.51      0.62        55
           Facts       0.78      0.89      0.83       499
         Invalid       0.50      0.43      0.46       144
           Issue       0.47      0.43      0.45        46
Rule/Law/Holding       0.34      0.19      0.24        54

        accuracy                           0.67       923
       macro avg       0.56      0.48      0.51       923
    weighted avg       0.65      0.67      0.66       923



In [58]:
x_un4 = tfidf_vect.transform(unlabel_4['Sentence'])
pred_unlabel_4 = model_3.predict_proba(x_un4)
pred_unlabel_4

array([[1.29469578e-04, 4.18861869e-04, 8.13557843e-04, 9.85213072e-01,
        1.05467942e-03, 1.23703596e-02],
       [1.08198931e-02, 9.88125594e-03, 2.03661198e-01, 6.89042075e-01,
        8.64497776e-02, 1.45800280e-04],
       [9.96320109e-01, 3.17454329e-03, 2.23140855e-04, 5.38114329e-07,
        2.46781647e-04, 3.48868669e-05],
       [1.07134864e-03, 4.81388228e-03, 7.25796985e-05, 9.92562905e-01,
        1.03256993e-04, 1.37602779e-03],
       [3.63293367e-06, 1.94456537e-03, 9.96148268e-01, 1.67797023e-03,
        2.07289783e-04, 1.82733620e-05],
       [3.37047389e-03, 1.66459218e-03, 1.95164794e-03, 1.00285329e-03,
        7.21001810e-01, 2.71008623e-01],
       [8.19646790e-05, 1.44270378e-04, 8.17196642e-06, 4.76616113e-03,
        6.00503826e-02, 9.34949049e-01],
       [1.15349464e-05, 2.06565485e-03, 9.97829933e-01, 7.73917568e-05,
        2.82775740e-06, 1.26577024e-05],
       [1.56908408e-05, 1.18274068e-04, 9.95974844e-01, 3.74753110e-03,
        8.94926872e-05, 

In [59]:
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_4:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[3, 0, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 5, 2, 2, 2, 2, 2, 1, 0, 0, 2, 0, 2, 2, 3, 2, 2, 2, 3, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 5, 2]
[0.9852130717131916, 0.996320109231326, 0.9925629045963505, 0.9961482683237932, 0.9978299329828588, 0.9959748441017648, 0.9983175598223651, 0.9840788879051711, 0.9810789327707822, 0.9987945388487491, 0.9901618485256424, 0.9830221005799373, 0.9974798729541411, 0.9984886288342216, 0.9979120887683947, 0.9995480544660423, 0.993255178766798, 0.9991948667856558, 0.9998754495958827, 0.998797504694732, 0.9855501403853646, 0.9969724412274169, 0.9830400064906508, 0.9948367655046149, 0.9985667058595954, 0.9907925420139255, 0.9823740306295514, 0.9894433620661569, 0.989365319102458, 0.9913869135779727, 0.9826949793959684, 0.9805720111108018, 0.9862040299559365, 0.9957475880484471, 0.9934342521365979, 0.985170901474226, 0.998688392603582, 0.9966847554826556, 0.9964790797215283, 0.9917649681378384, 0.9881826429578061, 0.9996521903578474, 0.995023491458

In [60]:
unlabel_4 = unlabel_4.loc[[0, 2, 3, 4, 7, 8, 10, 11, 15, 18, 20, 21, 22, 27, 28, 29, 30, 31, 32, 34, 36, 37, 38, 39, 40, 42, 45, 46, 48, 50, 54, 55, 56, 58, 60, 61, 63, 66, 67, 69, 70, 71, 72, 73, 75, 85, 87, 93, 94, 99 ],:]
unlabel_4.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,266,section 6 pl,Invalid
2,1471,evidence sufficient support conviction appella...,Analysis
3,1990,cases cited note 19,Invalid
4,2757,july 8 1960 order entered revoking said probat...,Facts
7,765,time ambulance police called ruby bickmore rem...,Facts


In [64]:
frame_4 = [train_3,unlabel_4]
train_4 = pd.concat(frame_4)
len(train_4)
x_train_4 = tfidf_vect.transform(train_4['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_4,train_4['Target'],test_size=0.18,random_state=42)
support = svm.LinearSVC()
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_4=calibrator.fit(X_train, Y_train)
pred_svm = model_4.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.7015985790408525
                  precision    recall  f1-score   support

        Analysis       0.50      0.46      0.48        70
      Conclusion       0.61      0.47      0.53        30
           Facts       0.81      0.92      0.86       306
         Invalid       0.47      0.50      0.49        84
           Issue       0.56      0.44      0.49        32
Rule/Law/Holding       0.87      0.32      0.46        41

        accuracy                           0.70       563
       macro avg       0.64      0.52      0.55       563
    weighted avg       0.70      0.70      0.69       563



In [65]:
x_un5 = tfidf_vect.transform(unlabel_5['Sentence'])

pred_unlabel_5 = model_4.predict_proba(x_un5)
pred_unlabel_5

array([[3.47839756e-01, 2.96622740e-01, 9.01815655e-05, 2.31916267e-01,
        9.44695583e-02, 2.90614975e-02],
       [8.80320365e-01, 2.71389029e-04, 1.69418277e-02, 3.59935102e-02,
        8.44173884e-06, 6.64644667e-02],
       [2.87776771e-04, 2.90171758e-03, 9.38695941e-03, 1.32651696e-01,
        8.50780244e-01, 3.99160633e-03],
       [2.24282052e-04, 4.01667431e-04, 9.98884359e-01, 3.81666400e-04,
        6.87301253e-05, 3.92950865e-05],
       [4.74247023e-02, 5.19689597e-03, 5.93616977e-01, 3.53537141e-01,
        5.07648946e-05, 1.73519340e-04],
       [2.10510295e-03, 1.91609669e-03, 7.88509935e-01, 2.07164166e-01,
        7.81131898e-05, 2.26586712e-04],
       [5.87280576e-01, 1.49013203e-01, 3.15400750e-03, 1.96295279e-02,
        2.35180557e-01, 5.74212732e-03],
       [3.87215083e-04, 2.43959962e-04, 9.96357540e-01, 2.88163394e-03,
        4.63538517e-05, 8.32969590e-05],
       [4.96142075e-04, 2.79519003e-04, 2.33903997e-01, 7.50528184e-01,
        1.67535933e-04, 

In [66]:
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_5:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[2, 2, 2, 5, 3, 2, 2, 2, 2, 2, 1, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 3, 0, 0, 2]
[0.9988843589047455, 0.9963575402073694, 0.9807631741625501, 0.9942205379257215, 0.9993405042765197, 0.9894685205253773, 0.9826284975834824, 0.9988915474883969, 0.9995796739938198, 0.998425236717666, 0.9997669466853755, 0.9930432340855844, 0.9905704811329934, 0.9865547583354342, 0.9862809980994308, 0.9996445187585361, 0.9930635528278839, 0.9975848392414258, 0.9951204138607904, 0.9901887329057516, 0.9902296003121398, 0.986172725228249, 0.9827454456294094, 0.9862523346704901, 0.9998893123269155, 0.9923693469096124, 0.9996252609081542, 0.9968703904702425, 0.9989914684719491, 0.9989712636212011, 0.9980348217374974, 0.9971379252105111, 0.9930211181825325, 0.9804954529919857, 0.9888810447950914, 0.9882416776853894, 0.9978147470781918, 0.9818963730694658]
[3, 7, 9, 18, 19, 20, 22, 23, 26, 32, 36, 46, 48, 50, 52, 53, 59, 60, 63, 65, 73, 75, 77, 78, 82, 85, 86, 91, 94, 95, 97, 100,

In [67]:
unlabel_5 = unlabel_5.loc[[3, 7, 9, 18, 19, 20, 22, 23, 26, 32, 36, 46, 48, 50, 52, 53, 59, 60, 63, 65, 73, 75, 77, 78, 82, 85, 86, 91, 94, 95, 97, 100, 102, 103, 107, 109, 110, 116],:]
unlabel_5.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
3,658,appellant needle marks inside left elbow,Facts
7,2005,âœall male persons habitually associate prosti...,Invalid
9,3087,appellant urges error failure trial court admo...,Issue
18,1438,conviction possession heroin punishment twenty...,Rule/Law/Holding
19,1034,stateâs brief cited massey v,Invalid


In [82]:
frame_5 = [train_4,unlabel_5]
train_5 = pd.concat(frame_5)
len(train_5)
x_train_5 = tfidf_vect.transform(train_5['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_5,train_5['Target'],test_size=0.3,random_state=42)
support = svm.LinearSVC()
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_5=calibrator.fit(X_train, Y_train)
pred_svm = model_5.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6828240252897787
                  precision    recall  f1-score   support

        Analysis       0.46      0.42      0.44       123
      Conclusion       0.66      0.49      0.56        47
           Facts       0.78      0.90      0.84       521
         Invalid       0.50      0.48      0.49       144
           Issue       0.61      0.46      0.52        50
Rule/Law/Holding       0.48      0.22      0.30        64

        accuracy                           0.68       949
       macro avg       0.58      0.49      0.52       949
    weighted avg       0.66      0.68      0.67       949



In [83]:
t_p = tfidf_vect.transform(test['Sentence'])
test_pred = model_5.predict(t_p)
print('Accuracy %s' % accuracy_score(test_pred,test['Target']))

Accuracy 0.6660268714011516
