<a href="https://colab.research.google.com/github/sundarp17/sundar_info5731_fall2020/blob/master/Highkappa_unbalanced_high_probability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

train_kappa = pd.read_csv(r'/content/train_data_highkappa.csv')
train_kappa.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,659,Appellant had stated to the officers that she ...,Invalid
1,3456,We shall discuss the facts more fully in conne...,Others
2,2043,"â€œPerjury is a false statement, either writte...",Invalid
3,3344,The offense is felony theft by false pretext; ...,Issue
4,3231,Numerous contentions urging the commission of ...,Issue


In [None]:
train_kappa['Target'].unique()

array(['Invalid', 'Others', 'Issue', 'Analysis', 'Facts', 'Conclusion',
       'Rule/Law/Holding'], dtype=object)

In [None]:
train_kappa['Target']=train_kappa['Target'].replace(['Others'],'Invalid')
train_kappa['Target'].unique()

array(['Invalid', 'Issue', 'Analysis', 'Facts', 'Conclusion',
       'Rule/Law/Holding'], dtype=object)

In [None]:
#cleaning
import nltk
import re
import string
nltk.download('stopwords')
nltk.download('wordnet')
stopword=nltk.corpus.stopwords.words('english')
from nltk.stem import WordNetLemmatizer
wl= WordNetLemmatizer()

def clean_text(text):
  text="".join([word.lower() for word in text if word not in string.punctuation])
  tokens = re.split('\W+',text)
  text = [wl.lemmatize(word) for word in tokens if word not in stopword]
  return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer = clean_text)
X_tfidf = tfidf_vect.fit_transform(train_kappa['Sentence'])
print(X_tfidf.shape)

(2951, 5835)


In [None]:
#using smote and stochastic gradient boosting
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.calibration import CalibratedClassifierCV


X_train, x_val, Y_train, y_val = train_test_split(X_tfidf,train_kappa['Target'],test_size=0.26,random_state=42)
sgd = SGDClassifier(max_iter=1000, tol=1e-3,loss='hinge',class_weight='balanced')
clf = sgd.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model=calibrator.fit(X_train, Y_train)
pred_sgd = model.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_sgd,y_val))
print(classification_report(y_val,pred_sgd))

Accuracy 0.66796875
                  precision    recall  f1-score   support

        Analysis       0.56      0.46      0.51       104
      Conclusion       0.63      0.46      0.54        41
           Facts       0.75      0.88      0.81       416
         Invalid       0.45      0.49      0.47       109
           Issue       0.67      0.33      0.44        55
Rule/Law/Holding       0.43      0.23      0.30        43

        accuracy                           0.67       768
       macro avg       0.58      0.47      0.51       768
    weighted avg       0.65      0.67      0.65       768



In [None]:
def frequency_table(data):
    frequencytable = {}
    for key in data:
        if key in frequencytable:
            frequencytable[key] += 1
        else:
            frequencytable[key] = 1
    return frequencytable

frequency_table(train_kappa['Target'])



{'Analysis': 439,
 'Conclusion': 147,
 'Facts': 1510,
 'Invalid': 475,
 'Issue': 190,
 'Rule/Law/Holding': 190}

In [None]:
test = pd.read_csv(r'/content/test_data.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,2173,Darden Sr. testified â€œI saw he was staggerin...,Facts
1,466,Additional moral justification may have been d...,Analysis
2,525,"85, 22 S.W. 140, wherein the accused was actin...",Rule/Law/Holding
3,2199,Under a proper instruction from the court on m...,Facts
4,79,He is eligible for leaves to renew contacts wi...,Facts


In [None]:
test['Target'].unique()

array(['Facts', 'Analysis', 'Rule/Law/Holding', 'Conclusion', 'Invalid',
       'Issue', 'Others'], dtype=object)

In [None]:
test['Target']=test['Target'].replace(['Others'],'Invalid')
test['Target'].unique()

array(['Facts', 'Analysis', 'Rule/Law/Holding', 'Conclusion', 'Invalid',
       'Issue'], dtype=object)

In [None]:
test['Sentence'] = test['Sentence'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
test['Sentence'] = test['Sentence'].str.replace('[^\w\s]','')
from nltk.corpus import stopwords
words = stopwords.words('english')
test['Sentence'] = test['Sentence'].apply(lambda x: " ".join(x for x in x.split() if x not in words))

In [None]:
from textblob import TextBlob
from textblob import Word
nltk.download('wordnet')
nltk.download('punkt')
test['Sentence'] = test['Sentence'].apply(lambda x: TextBlob(x).words)
test['Sentence'] = test['Sentence'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x]))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
test['Sentence'].head()

0    darden sr testified âœi saw staggering â prett...
1    additional moral justification may due complai...
2    85 22 sw 140 wherein accused acting upon advic...
3    proper instruction court murder malice murder ...
4                   eligible leaf renew contact family
Name: Sentence, dtype: object

In [None]:
len(test['Sentence'])

521

In [None]:
unlabel_1 = test.loc[:100]
print("length of unlabel_1",len(unlabel_1))
unlabel_2 = test.loc[101:200]
print("length of unlabel_2",len(unlabel_2))
unlabel_3 = test.loc[201:300]
print("length of unlabel_3",len(unlabel_3))
unlabel_4 = test.loc[301:400]
print("length of unlabel_4",len(unlabel_4))
unlabel_5 = test.loc[401:]
print("length of unlabel_5",len(unlabel_5))

length of unlabel_1 101
length of unlabel_2 100
length of unlabel_3 100
length of unlabel_4 100
length of unlabel_5 120


In [None]:
unlabel_1.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,2173,darden sr testified âœi saw staggering â prett...,Facts
1,466,additional moral justification may due complai...,Analysis
2,525,85 22 sw 140 wherein accused acting upon advic...,Rule/Law/Holding
3,2199,proper instruction court murder malice murder ...,Facts
4,79,eligible leaf renew contact family,Facts


In [None]:
unlabel_2.reset_index(inplace=True)
del unlabel_2['index']
unlabel_2.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,1207,examination occurred night middle may testifie...,Invalid
1,3400,undisputed testimony reflects appellant drivin...,Facts
2,2072,appellant testify offer evidence behalf,Facts
3,517,56 uniform act regulating traffic highway gist...,Rule/Law/Holding
4,906,agreed accompany gave pawn ticket money reclai...,Facts


In [None]:
unlabel_3.reset_index(inplace=True)
del unlabel_3['index']
unlabel_3.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,1882,find evidence amply sufficient sustain juryâs ...,Analysis
1,964,action show thatâ,Invalid
2,1417,certification attending physician shown filed ...,Facts
3,2932,testified agent bland first left automobile wa...,Facts
4,1584,officer testified could smell odor alcohol app...,Facts


In [None]:
unlabel_4.reset_index(inplace=True)
del unlabel_4['index']
unlabel_4.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,266,section 6 pl,Invalid
1,3469,amended answer appellant alleged bond invalid ...,Analysis
2,1471,evidence sufficient support conviction appella...,Analysis
3,1990,case cited note 19,Invalid
4,2757,july 8 1960 order entered revoking said probat...,Facts


In [None]:
unlabel_5.reset_index(inplace=True)
del unlabel_5['index']
unlabel_5.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,2979,overrule appellantâs contention court erred pe...,Analysis
1,2135,state regarded authority case,Invalid
2,1591,testimony adduced appellant mother lillie mae ...,Facts
3,658,appellant needle mark inside left elbow,Facts
4,2562,analysis content 3 capsule chemist kenneth and...,Facts


In [None]:
model.classes_

array(['Analysis', 'Conclusion', 'Facts', 'Invalid', 'Issue',
       'Rule/Law/Holding'], dtype='<U16')

In [None]:
x_un1 = tfidf_vect.transform(unlabel_1['Sentence'])
pred_unlabel_1 = model.predict_proba(x_un1)
pred_unlabel_1

array([[3.03412070e-04, 5.26175762e-04, 9.93505532e-01, 5.54680417e-03,
        2.31607084e-06, 1.15759910e-04],
       [2.39288256e-01, 1.70128268e-04, 7.60144367e-01, 3.60763504e-05,
        2.49816749e-05, 3.36190862e-04],
       [7.53371028e-04, 2.63245497e-04, 3.53534254e-02, 9.61121267e-01,
        1.15252917e-03, 1.35616198e-03],
       [4.88186597e-01, 2.39416888e-07, 5.04803747e-01, 8.52447948e-04,
        7.09938674e-06, 6.14986968e-03],
       [2.04526651e-03, 1.43504664e-04, 9.95041971e-01, 2.69189869e-03,
        1.91668555e-05, 5.81921143e-05],
       [9.55507324e-05, 8.89996984e-05, 9.98072916e-01, 1.61123764e-03,
        2.48000321e-05, 1.06496377e-04],
       [5.84487060e-04, 1.53753195e-04, 3.61178102e-01, 6.37842550e-01,
        2.16537925e-04, 2.45699066e-05],
       [2.38464780e-05, 5.97754541e-01, 2.87489116e-02, 3.73463223e-01,
        3.35747166e-07, 9.14195098e-06],
       [1.50683255e-03, 2.42287077e-05, 3.10793979e-02, 9.67304764e-01,
        1.10628320e-05, 

In [None]:
import numpy as np
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_1:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3]
[0.9935055320185071, 0.9950419711631091, 0.9980729155190468, 0.9990206087250071, 0.9957947079613816, 0.986192579874566, 0.9828051372648968, 0.9935290062915814, 0.9939290250290735, 0.9813131149552955, 0.9844542590365958, 0.9970126610520873, 0.9971601297356079, 0.9846315969719541, 0.9876684866325023, 0.9937095298238904, 0.9968999234719642, 0.9996398076985459, 0.9872856947703561, 0.9946586111942958, 0.9984945388185681, 0.9971370325268678, 0.998124702314649, 0.9826672258186224, 0.9849364502558071, 0.9957063832191267, 0.9985282266718613, 0.9976296480060459, 0.9984519284216712, 0.9903165084584724]
[0, 4, 5, 11, 12, 17, 20, 21, 23, 29, 30, 31, 33, 35, 39, 40, 45, 48, 50, 56, 62, 65, 76, 79, 80, 89, 94, 95, 96, 98]
30
30
30


In [None]:
unlabel_1 = unlabel_1.loc[[0,4,5,11,12,17,20,21,23,29,30,31,33,35,39,40,45,48,50,56,62,65,76,79,80,89,94,95,96,98],:]
unlabel_1.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,2173,darden sr testified âœi saw staggering â prett...,Facts
4,79,eligible leaf renew contact family,Facts
5,1955,got downstairs ponce went car wife left home,Facts
11,1604,police notified deceased found lying mr moersâ...,Facts
12,1102,shown vine street drug storeâs supply narcotic...,Facts


In [None]:
frame_1 = [train_kappa,unlabel_1]
train_1 = pd.concat(frame_1)
len(train_1)

2981

In [None]:
x_train_1 = tfidf_vect.transform(train_1['Sentence'])
x_train_1.shape
X_train, x_val, Y_train, y_val = train_test_split(x_train_1,train_1['Target'],test_size=0.25,random_state=2)
sgd = SGDClassifier(max_iter=1000, tol=1e-3,loss='hinge',class_weight='balanced')
clf = sgd.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_1=calibrator.fit(X_train, Y_train)
pred_sgd = model_1.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_sgd,y_val))
print(classification_report(y_val,pred_sgd))

Accuracy 0.6568364611260054
                  precision    recall  f1-score   support

        Analysis       0.46      0.40      0.42        96
      Conclusion       0.67      0.46      0.55        39
           Facts       0.73      0.90      0.81       398
         Invalid       0.50      0.41      0.45       129
           Issue       0.52      0.33      0.40        40
Rule/Law/Holding       0.67      0.18      0.29        44

        accuracy                           0.66       746
       macro avg       0.59      0.45      0.49       746
    weighted avg       0.64      0.66      0.63       746



In [None]:
unlabel_2.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,1207,examination occurred night middle may testifie...,Invalid
1,3400,undisputed testimony reflects appellant drivin...,Facts
2,2072,appellant testify offer evidence behalf,Facts
3,517,56 uniform act regulating traffic highway gist...,Rule/Law/Holding
4,906,agreed accompany gave pawn ticket money reclai...,Facts


In [None]:
x_un2 = tfidf_vect.transform(unlabel_2['Sentence'])
pred_unlabel_2 = model_1.predict_proba(x_un2)
pred_unlabel_2

array([[5.72628693e-02, 1.29721660e-06, 6.53044765e-01, 2.89675467e-01,
        5.45072357e-06, 1.01511441e-05],
       [1.73803675e-02, 7.26015735e-06, 9.65100692e-01, 1.74750340e-02,
        2.30309893e-05, 1.36148588e-05],
       [3.20787982e-03, 6.17603029e-05, 9.95841965e-01, 6.72201861e-04,
        2.16092443e-04, 1.00203887e-07],
       [4.90743364e-03, 6.94297121e-05, 9.14849233e-01, 7.88162201e-02,
        1.42077716e-06, 1.35626322e-03],
       [1.35594784e-03, 3.11692227e-04, 9.85914118e-01, 1.64894112e-03,
        4.87361601e-06, 1.07644276e-02],
       [6.71843008e-01, 2.02142370e-04, 1.02952617e-01, 5.47737122e-02,
        1.70214334e-01, 1.41861971e-05],
       [1.07782119e-01, 1.96738815e-05, 8.88551313e-01, 3.36790080e-03,
        2.48174705e-04, 3.08188240e-05],
       [9.47991367e-04, 1.84932626e-04, 9.97260319e-01, 1.52706656e-03,
        7.85811691e-05, 1.10924332e-06],
       [5.02642831e-02, 1.63690661e-04, 9.05086577e-01, 1.44333686e-02,
        2.98544489e-02, 

In [None]:
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_2:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 3, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[0.9958419653670616, 0.9859141175885046, 0.9972603190374736, 0.9881712187194087, 0.999464517834786, 0.9944513820130505, 0.9858881202663298, 0.9977446550932919, 0.999696344487791, 0.9962537606982749, 0.9960594261034281, 0.9965090603491691, 0.9973804505387537, 0.9982332672975743, 0.9955231891601946, 0.9894664727571506, 0.9860855384889253, 0.999490860630111, 0.9935900088752679, 0.9995806290125435, 0.9935590526042122, 0.9900892691499836, 0.9877589427351785, 0.9871459110191689, 0.9946966420416423, 0.9986709953821614, 0.9853636384755653]
[2, 4, 7, 9, 12, 15, 20, 23, 29, 31, 36, 37, 39, 42, 49, 53, 54, 59, 63, 64, 69, 70, 75, 84, 85, 88, 91]
27
27
27


In [None]:
unlabel_2 = unlabel_2.loc[[2, 4, 7, 9, 12, 15, 20, 23, 29, 31, 36, 37, 39, 42, 49, 53, 54, 59, 63, 64, 69, 70, 75, 84, 85, 88, 91],:]
unlabel_2.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
2,2072,appellant testify offer evidence behalf,Facts
4,906,agreed accompany gave pawn ticket money reclai...,Facts
7,3150,finding door facing street seen man walking pr...,Facts
9,2887,sole ground advanced setting aside courtâs ord...,Issue
12,2085,mr joyce wisnoski sister appellant testified v...,Facts


In [None]:
frame_2 = [train_1,unlabel_2]
train_2 = pd.concat(frame_2)
len(train_2)

3008

In [None]:
x_train_2 = tfidf_vect.transform(train_2['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_2,train_2['Target'],test_size=0.22,random_state=42)
sgd = SGDClassifier(max_iter=1000, tol=1e-3,loss='hinge',class_weight='balanced')
clf = sgd.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_2=calibrator.fit(X_train, Y_train)
pred_sgd = model_2.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_sgd,y_val))
print(classification_report(y_val,pred_sgd))

Accuracy 0.6570996978851964
                  precision    recall  f1-score   support

        Analysis       0.54      0.40      0.46        92
      Conclusion       0.65      0.37      0.47        35
           Facts       0.74      0.90      0.81       357
         Invalid       0.46      0.44      0.45       102
           Issue       0.55      0.31      0.39        39
Rule/Law/Holding       0.35      0.19      0.25        37

        accuracy                           0.66       662
       macro avg       0.55      0.44      0.47       662
    weighted avg       0.63      0.66      0.63       662



In [None]:
x_un3 = tfidf_vect.transform(unlabel_3['Sentence'])
pred_unlabel_3 = model_2.predict_proba(x_un3)
pred_unlabel_3

array([[9.93602895e-01, 4.56114765e-03, 7.87726990e-04, 1.01954229e-03,
        2.85295757e-05, 1.58286873e-07],
       [2.89067157e-01, 2.07941443e-04, 6.18733489e-01, 9.19822567e-02,
        6.64646138e-06, 2.50938319e-06],
       [4.74448861e-05, 1.02915547e-06, 6.83358539e-01, 3.16555242e-01,
        1.14365796e-05, 2.63083080e-05],
       [1.57317029e-03, 3.38828107e-04, 9.17043282e-01, 8.10039914e-02,
        2.19897505e-05, 1.87380423e-05],
       [2.15774869e-02, 4.04795058e-05, 9.67259900e-01, 1.10048966e-02,
        2.15057371e-05, 9.57312548e-05],
       [2.19101469e-03, 1.96875954e-04, 8.40668533e-02, 9.12090047e-01,
        1.44336811e-03, 1.18411010e-05],
       [2.45378426e-02, 5.24999989e-04, 5.58795001e-01, 4.15852737e-01,
        1.36109132e-04, 1.53310145e-04],
       [4.71121884e-03, 1.33400196e-04, 9.94696020e-01, 4.51796355e-04,
        2.87191682e-07, 7.27693102e-06],
       [3.34839125e-03, 3.05315810e-05, 9.95513096e-01, 9.12884261e-04,
        2.15000697e-07, 

In [None]:
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_3:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2]
[0.9936028952137997, 0.9946960204883722, 0.9955130963509797, 0.9995436547492852, 0.9877310842070094, 0.9976407826583045, 0.9875114634599979, 0.9990726488822422, 0.9853767434441922, 0.9862318581799873, 0.9985480153449864, 0.9906541570816229, 0.9993762225919262, 0.9976481877828374, 0.9974554534973792, 0.9952382111464214, 0.9852914774439763, 0.997393087074503, 0.98845771638222, 0.9979835627350904, 0.9983868797570952, 0.9865321620555932, 0.9954154125414592, 0.9984048814540502, 0.9889992997130015, 0.9928201398809484, 0.9947540310484745]
[0, 7, 8, 12, 19, 20, 21, 27, 29, 30, 35, 36, 43, 46, 47, 48, 49, 53, 54, 57, 63, 64, 65, 68, 72, 78, 89]
27
27
27


In [None]:
unlabel_3 = unlabel_3.loc[[0, 7, 8, 12, 19, 20, 21, 27, 29, 30, 35, 36, 43, 46, 47, 48, 49, 53, 54, 57, 63, 64, 65, 68, 72, 78, 89],:]
unlabel_3.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,1882,find evidence amply sufficient sustain juryâs ...,Analysis
7,2115,told get back bed said floor cool sick stomach...,Facts
8,1278,testified first time would come bed week altho...,Facts
12,2106,came gone ten day said get something somebody ...,Facts
19,985,knew wrong knew wrong â,Facts


In [None]:
frame_3 = [train_2,unlabel_3]
train_3 = pd.concat(frame_3)
len(train_3)
x_train_3 = tfidf_vect.transform(train_3['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_3,train_3['Target'],test_size=0.2,random_state=2)
sgd = SGDClassifier(max_iter=1000, tol=1e-3,loss='hinge',class_weight='balanced')
clf = sgd.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_3=calibrator.fit(X_train, Y_train)
pred_sgd = model_3.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_sgd,y_val))
print(classification_report(y_val,pred_sgd))

Accuracy 0.642504118616145
                  precision    recall  f1-score   support

        Analysis       0.53      0.39      0.45       101
      Conclusion       0.53      0.37      0.43        27
           Facts       0.71      0.88      0.79       302
         Invalid       0.52      0.56      0.54       104
           Issue       0.50      0.20      0.28        41
Rule/Law/Holding       0.62      0.25      0.36        32

        accuracy                           0.64       607
       macro avg       0.57      0.44      0.47       607
    weighted avg       0.62      0.64      0.62       607



In [None]:
x_un4 = tfidf_vect.transform(unlabel_4['Sentence'])
pred_unlabel_4 = model_3.predict_proba(x_un4)
pred_unlabel_4

array([[6.56988519e-04, 2.99152708e-05, 6.56024593e-02, 9.30378667e-01,
        3.14569320e-04, 3.01740081e-03],
       [2.07697371e-01, 8.48454979e-04, 2.44897733e-02, 7.64117337e-01,
        2.72588132e-03, 1.21183172e-04],
       [9.72196282e-01, 1.98535513e-04, 3.22388204e-04, 2.67403788e-02,
        1.03646506e-05, 5.32051113e-04],
       [2.14521021e-02, 1.01757599e-02, 9.57481903e-03, 9.58671072e-01,
        2.72417132e-05, 9.90050970e-05],
       [2.20192694e-07, 1.95171514e-03, 9.95107742e-01, 6.36897512e-04,
        2.27772721e-03, 2.56983649e-05],
       [4.73977928e-03, 8.60934393e-04, 2.80198091e-01, 1.43481850e-01,
        3.14662737e-01, 2.56056608e-01],
       [3.00907396e-03, 4.54195492e-04, 3.94730073e-03, 2.75109736e-03,
        4.23776623e-03, 9.85600566e-01],
       [5.31779563e-04, 1.82134342e-04, 9.69739558e-01, 2.94549306e-02,
        1.84778884e-06, 8.97494449e-05],
       [1.45201091e-03, 2.77054123e-05, 9.67111746e-01, 3.11810546e-02,
        2.05136528e-04, 

In [None]:
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_4:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[2, 5, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 0, 2, 2, 3, 2, 3, 0, 2, 2, 2, 2, 2]
[0.9951077415886511, 0.9856005662262424, 0.9996716461360757, 0.9847895700917801, 0.9986866471861087, 0.9962364793793735, 0.9850209411810147, 0.9831008649841181, 0.9986701840288749, 0.9903596617508807, 0.985928112788697, 0.9957083275814085, 0.9859092419087077, 0.9881517733790415, 0.9957994001448696, 0.9884029347142943, 0.994921492299187, 0.9822585331557668, 0.9968423409218347, 0.986246403211791, 0.9948043099979872, 0.9875420672935075, 0.9972636583664112, 0.9977843090823967, 0.9917056698780886, 0.9995160533733003, 0.9836771345621451, 0.9960349897687263]
[4, 6, 10, 12, 15, 18, 22, 28, 29, 31, 32, 34, 37, 38, 39, 42, 46, 50, 56, 58, 64, 66, 67, 70, 72, 75, 97, 99]
28
28
28


In [None]:
unlabel_4 = unlabel_4.loc[[4, 6, 10, 12, 15, 18, 22, 28, 29, 31, 32, 34, 37, 38, 39, 42, 46, 50, 56, 58, 64, 66, 67, 70, 72, 75, 97, 99],:]
unlabel_4.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
4,2757,july 8 1960 order entered revoking said probat...,Facts
6,2333,offense burglary two prior conviction felony l...,Issue
10,1204,crossexamination captain johnson testified wen...,Facts
12,406,unnecessarily restrictive beyond reasonable re...,Issue
15,2589,appellant arrested following day two three day...,Facts


In [None]:
frame_4 = [train_3,unlabel_4]
train_4 = pd.concat(frame_4)
len(train_4)
x_train_4 = tfidf_vect.transform(train_4['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_4,train_4['Target'],test_size=0.2,random_state=42)
sgd = SGDClassifier(max_iter=1000, tol=1e-3,loss='hinge',class_weight='balanced')
clf = sgd.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_4=calibrator.fit(X_train, Y_train)
pred_sgd = model_4.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_sgd,y_val))
print(classification_report(y_val,pred_sgd))

Accuracy 0.6606851549755302
                  precision    recall  f1-score   support

        Analysis       0.54      0.39      0.45        85
      Conclusion       0.62      0.37      0.46        35
           Facts       0.73      0.92      0.81       325
         Invalid       0.47      0.44      0.45        96
           Issue       0.62      0.35      0.45        37
Rule/Law/Holding       0.50      0.17      0.26        35

        accuracy                           0.66       613
       macro avg       0.58      0.44      0.48       613
    weighted avg       0.64      0.66      0.63       613



In [None]:
x_un5 = tfidf_vect.transform(unlabel_5['Sentence'])

pred_unlabel_5 = model_4.predict_proba(x_un5)
pred_unlabel_5


array([[2.39650943e-01, 7.48035310e-01, 1.10620400e-04, 1.04483412e-02,
        1.22811998e-03, 5.26665410e-04],
       [8.46735815e-01, 4.88843357e-03, 5.81539255e-02, 4.63718771e-02,
        8.21393038e-05, 4.37678098e-02],
       [4.69537033e-03, 3.51993318e-04, 4.95251826e-01, 6.47771466e-02,
        4.33796085e-01, 1.12757881e-03],
       [2.25131549e-03, 1.08339290e-04, 9.95412810e-01, 2.16953817e-03,
        5.37037809e-05, 4.29332204e-06],
       [2.51910762e-01, 9.32543291e-05, 7.14406643e-01, 3.35820301e-02,
        6.32896769e-06, 9.82167740e-07],
       [3.30058652e-03, 2.10405090e-04, 7.36079739e-01, 2.60346780e-01,
        1.01951225e-05, 5.22940983e-05],
       [2.42686531e-01, 6.75136582e-02, 7.07644615e-03, 9.84193830e-03,
        6.65014792e-01, 7.86663417e-03],
       [2.51268873e-03, 8.80610585e-05, 9.97115123e-01, 1.94848926e-04,
        5.08760195e-05, 3.84025592e-05],
       [5.37168140e-04, 1.32601102e-04, 5.04135642e-02, 9.27734075e-01,
        6.29800976e-05, 

In [None]:
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_5:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[2, 2, 3, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 3, 0, 0]
[0.9954128099505486, 0.9971151227090946, 0.9935215245500303, 0.9811387839710393, 0.986313769340299, 0.9963926340974268, 0.995169083820218, 0.9859333580260976, 0.9977739835534862, 0.9990423368197966, 0.9812803670844139, 0.9947363986211027, 0.9995251648478767, 0.9859333580260976, 0.9985134356343949, 0.9845562446912134, 0.9811330935275482, 0.9997851456385038, 0.9841196521512124, 0.9969794560187277, 0.9950222276472876, 0.999262018373968, 0.9995740842609027, 0.9937584728569153, 0.9899498503183384, 0.9946968421805346, 0.9976258479911546, 0.9874751983156796, 0.989431920754817]
[3, 7, 19, 20, 23, 26, 27, 31, 32, 36, 50, 53, 60, 64, 65, 73, 77, 82, 85, 86, 91, 94, 95, 97, 99, 104, 107, 110, 113]
29
29
29


In [None]:
unlabel_5 = unlabel_5.loc[[3, 7, 19, 20, 23, 26, 27, 31, 32, 36, 50, 53, 60, 64, 65, 73, 77, 82, 85, 86, 91, 94, 95, 97, 99, 104, 107, 110, 113],:]
unlabel_5.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
3,658,appellant needle mark inside left elbow,Facts
7,2005,âœall male person habitually associate prostit...,Invalid
19,1034,stateâs brief cited massey v,Invalid
20,808,cheto collapsed sidewalkâ,Facts
23,2673,testified waited five minute proceeded directi...,Facts


In [None]:
frame_5 = [train_4,unlabel_5]
train_5 = pd.concat(frame_5)
len(train_5)
x_train_5 = tfidf_vect.transform(train_5['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_5,train_5['Target'],test_size=0.2,random_state=42)
sgd = SGDClassifier(max_iter=1000, tol=1e-3,loss='hinge',class_weight='balanced')
clf = sgd.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_5=calibrator.fit(X_train, Y_train)
pred_sgd = model_5.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_sgd,y_val))
print(classification_report(y_val,pred_sgd))

Accuracy 0.6655896607431341
                  precision    recall  f1-score   support

        Analysis       0.49      0.36      0.42        91
      Conclusion       0.86      0.49      0.62        37
           Facts       0.74      0.88      0.81       339
         Invalid       0.41      0.46      0.44        80
           Issue       0.72      0.51      0.60        35
Rule/Law/Holding       0.54      0.19      0.28        37

        accuracy                           0.67       619
       macro avg       0.63      0.48      0.53       619
    weighted avg       0.66      0.67      0.65       619



In [None]:
t_p = tfidf_vect.transform(test['Sentence'])
test_pred = model_5.predict(t_p)
print('Accuracy %s' % accuracy_score(test_pred,test['Target']))

Accuracy 0.6218809980806143
