<a href="https://colab.research.google.com/github/sundarp17/semi-supervised/blob/main/high_kappa_svc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd

In [3]:
high_kappa = pd.read_csv(r'/content/train_data_highkappa.csv')


test = pd.read_csv(r'/content/test_data.csv')

In [4]:
high_kappa['Target'] = high_kappa['Target'].replace(['Others'],'Invalid')


In [5]:
#cleaning
import nltk
import re
import string
nltk.download('stopwords')
nltk.download('wordnet')
stopword=nltk.corpus.stopwords.words('english')
from nltk.stem import WordNetLemmatizer
wl= WordNetLemmatizer()

def clean_text(text):
  text="".join([word.lower() for word in text if word not in string.punctuation])
  tokens = re.split('\W+',text)
  text = [wl.lemmatize(word) for word in tokens if word not in stopword]
  return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer = clean_text)
high_kappa_X_tfidf = tfidf_vect.fit_transform(high_kappa['Sentence'])


In [7]:
test['Target']=test['Target'].replace(['Others'],'Invalid')
test['Sentence'] = test['Sentence'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
test['Sentence'] = test['Sentence'].str.replace('[^\w\s]','')
from nltk.corpus import stopwords
words = stopwords.words('english')
test['Sentence'] = test['Sentence'].apply(lambda x: " ".join(x for x in x.split() if x not in words))

In [8]:
unlabel_1 = test.loc[:100]
print("length of unlabel_1",len(unlabel_1))
unlabel_2 = test.loc[101:200]
print("length of unlabel_2",len(unlabel_2))
unlabel_3 = test.loc[201:300]
print("length of unlabel_3",len(unlabel_3))
unlabel_4 = test.loc[301:400]
print("length of unlabel_4",len(unlabel_4))
unlabel_5 = test.loc[401:]
print("length of unlabel_5",len(unlabel_5))

length of unlabel_1 101
length of unlabel_2 100
length of unlabel_3 100
length of unlabel_4 100
length of unlabel_5 120


In [9]:
unlabel_2.reset_index(inplace=True)
del unlabel_2['index']
unlabel_2.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,1207,examination occurred night middle may testifie...,Invalid
1,3400,undisputed testimony reflects appellant drivin...,Facts
2,2072,appellant testify offer evidence behalf,Facts
3,517,56 uniform act regulating traffic highways gis...,Rule/Law/Holding
4,906,agreed accompany gave pawn ticket money reclai...,Facts


In [10]:
unlabel_3.reset_index(inplace=True)
del unlabel_3['index']
unlabel_3.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,1882,find evidence amply sufficient sustain juryâs ...,Analysis
1,964,actions show thatâ,Invalid
2,1417,certification attending physician shown filed ...,Facts
3,2932,testified agent bland first left automobile wa...,Facts
4,1584,officer testified could smell odor alcohol app...,Facts


In [11]:
unlabel_4.reset_index(inplace=True)
del unlabel_4['index']
unlabel_4.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,266,section 6 pl,Invalid
1,3469,amended answer appellant alleged bond invalid ...,Analysis
2,1471,evidence sufficient support conviction appella...,Analysis
3,1990,cases cited note 19,Invalid
4,2757,july 8 1960 order entered revoking said probat...,Facts


In [12]:
unlabel_5.reset_index(inplace=True)
del unlabel_5['index']
unlabel_5.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,2979,overrule appellantâs contention court erred pe...,Analysis
1,2135,state regarded authority case,Invalid
2,1591,testimony adduced appellant mother lillie mae ...,Facts
3,658,appellant needle marks inside left elbow,Facts
4,2562,analysis contents 3 capsules chemist kenneth a...,Facts


In [13]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.calibration import CalibratedClassifierCV


X_train, x_val, Y_train, y_val = train_test_split(high_kappa_X_tfidf,high_kappa['Target'],test_size=0.26,random_state=42)
support = svm.LinearSVC(multi_class='ovr',class_weight='balanced')
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model=calibrator.fit(X_train, Y_train)
pred_svm = model.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6861979166666666
                  precision    recall  f1-score   support

        Analysis       0.53      0.49      0.51       104
      Conclusion       0.68      0.46      0.55        41
           Facts       0.77      0.91      0.83       416
         Invalid       0.52      0.45      0.48       109
           Issue       0.63      0.35      0.45        55
Rule/Law/Holding       0.41      0.28      0.33        43

        accuracy                           0.69       768
       macro avg       0.59      0.49      0.53       768
    weighted avg       0.67      0.69      0.67       768



In [14]:
x_un1 = tfidf_vect.transform(unlabel_1['Sentence'])
pred_unlabel_1 = model.predict_proba(x_un1)

In [15]:
import numpy as np
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_1:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[2, 2, 2, 0, 2, 2, 2, 2, 2, 3, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 3]
[0.9976762305673698, 0.9973674622774974, 0.9992761793245148, 0.9847049605451148, 0.9970842734202897, 0.9992562641337511, 0.9983159229241089, 0.99741552547495, 0.9949124109430324, 0.980244075203021, 0.9954707704640529, 0.9998238817394584, 0.9940097764505761, 0.9915800921364073, 0.9970356332674796, 0.9931711985946821, 0.9986043856026221, 0.9991594396993941, 0.9978614407653355, 0.9997686617609846, 0.9921972987563068, 0.9931984051533534, 0.9974152696151873, 0.9997804617679371, 0.9990233341161356, 0.9967047103354286, 0.989684479954441, 0.9895244727177135, 0.9966512748801728, 0.9906131297409883, 0.9825836769792019, 0.9997697408782279, 0.9894535946640746, 0.9915585287054423, 0.9913258504855246, 0.9987776682892481, 0.9935847567925917, 0.9989465513338849, 0.9944827968421007]
[0, 4, 5, 10, 11, 12, 17, 18, 23, 26, 29, 31, 32, 33, 34, 35, 39, 40, 45, 48, 50, 51, 55, 56, 62, 65, 68,

In [16]:
unlabel_1=unlabel_1.loc[[0, 4, 5, 10, 11, 12, 17, 18, 23, 26, 29, 31, 32, 33, 34, 35, 39, 40, 45, 48, 50, 51, 55, 56, 62, 65, 68, 69, 70, 74, 75, 76, 77, 87, 89, 94, 95, 96, 98],:]
unlabel_1.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,2173,darden sr testified âœi saw staggering â prett...,Facts
4,79,eligible leaves renew contacts family,Facts
5,1955,got downstairs ponce went car wife left home,Facts
10,2262,first says evidence support charge comment upo...,Issue
11,1604,police notified deceased found lying mrs moers...,Facts


In [17]:
frame_1 = [high_kappa,unlabel_1]
train_1 = pd.concat(frame_1)
len(train_1)

2990

In [18]:
x_train_1 = tfidf_vect.transform(train_1['Sentence'])
x_train_1.shape
X_train, x_val, Y_train, y_val = train_test_split(x_train_1,train_1['Target'],test_size=0.2,random_state=42)
support = svm.LinearSVC(multi_class='ovr',class_weight='balanced')
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_1=calibrator.fit(X_train, Y_train)
pred_svm = model_1.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6822742474916388
                  precision    recall  f1-score   support

        Analysis       0.58      0.49      0.53        86
      Conclusion       0.65      0.35      0.46        31
           Facts       0.75      0.88      0.81       324
         Invalid       0.51      0.52      0.51        87
           Issue       0.67      0.46      0.54        35
Rule/Law/Holding       0.53      0.26      0.35        35

        accuracy                           0.68       598
       macro avg       0.61      0.49      0.53       598
    weighted avg       0.67      0.68      0.67       598



In [19]:
x_un2 = tfidf_vect.transform(unlabel_2['Sentence'])
pred_unlabel_2 = model_1.predict_proba(x_un2)
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_2:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 3, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2]
[0.9988148455356204, 0.9955889632256993, 0.9938935444930089, 0.9991735635800268, 0.998642765379913, 0.9998215485119392, 0.9893765418782783, 0.9858773146600566, 0.9996696826584589, 0.9959119945850989, 0.9889173926942028, 0.9970122783361617, 0.9998341453410665, 0.9983731101022827, 0.9999316699098572, 0.997732391564297, 0.998812621715285, 0.9942791326203615, 0.9913927180462847, 0.999688015488552, 0.9982861692141178, 0.999774243044244, 0.9985938719895736, 0.9995436942251603, 0.987944400909683, 0.9920877950675142, 0.9809668309012182, 0.9965159428937383, 0.9908526038859935, 0.9913209412772077, 0.9997883055869916, 0.9984860352433985, 0.9954314393608144]
[2, 4, 6, 7, 8, 12, 20, 21, 29, 32, 34, 36, 37, 39, 42, 44, 49, 50, 55, 59, 63, 64, 69, 70, 71, 75, 79, 83, 85, 86, 88, 91, 99]
33
33
33


In [20]:
unlabel_2 = unlabel_2.loc[[2, 4, 6, 7, 8, 12, 20, 21, 29, 32, 34, 36, 37, 39, 42, 44, 49, 50, 55, 59, 63, 64, 69, 70, 71, 75, 79, 83, 85, 86, 88, 91, 99 ],:]
unlabel_2.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
2,2072,appellant testify offer evidence behalf,Facts
4,906,agreed accompany gave pawn ticket money reclai...,Facts
6,570,appellant performed work asked 7500 money ther...,Facts
7,3150,finding door facing street seen man walking pr...,Facts
8,710,appellant indicted robbery indictment alleging...,Analysis


In [21]:
frame_2 = [train_1,unlabel_2]
train_2 = pd.concat(frame_2)
len(train_2)

3023

In [22]:
x_train_2 = tfidf_vect.transform(train_2['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_2,train_2['Target'],test_size=0.27,random_state=2)
support = svm.LinearSVC(multi_class='ovr',class_weight='balanced')
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_2=calibrator.fit(X_train, Y_train)
pred_svm = model_2.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6511627906976745
                  precision    recall  f1-score   support

        Analysis       0.46      0.40      0.43       118
      Conclusion       0.52      0.38      0.44        45
           Facts       0.73      0.91      0.81       418
         Invalid       0.53      0.43      0.47       140
           Issue       0.72      0.34      0.46        53
Rule/Law/Holding       0.45      0.23      0.31        43

        accuracy                           0.65       817
       macro avg       0.57      0.45      0.49       817
    weighted avg       0.63      0.65      0.63       817



In [23]:
x_un3 = tfidf_vect.transform(unlabel_3['Sentence'])
pred_unlabel_3 = model_2.predict_proba(x_un3)

pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_3:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[0.9939456819277874, 0.9975414877870368, 0.9974951233059602, 0.992877453246, 0.9993131536144153, 0.9995158083952358, 0.9824330485485151, 0.9996225757008086, 0.9857854780477302, 0.9995389356319249, 0.9894785934532445, 0.9944885256334962, 0.9998818780594918, 0.9966580562553162, 0.999391133864347, 0.9917371352860366, 0.9994821064491963, 0.9995521015539321, 0.9840107650376706, 0.9995464282539737, 0.9995264431045985, 0.9972349688370145, 0.9985121292913292, 0.9978834384479829, 0.994673085047678, 0.9931225421029054, 0.9992019441566335, 0.9951274296191157, 0.9953030889398501, 0.9920615665317145, 0.9998449529792585, 0.9900645268733347, 0.9997968812226944, 0.9975632529284784, 0.9982862842543676, 0.9995441512642388, 0.9997099314000429, 0.9871292550439659, 0.9978526939087082, 0.9979510876400934, 0.9998593533199522, 0.9924760358394228, 0.985342506680777]
[0, 2, 3, 4, 7, 

In [24]:
unlabel_3=unlabel_3.loc[[0, 2, 3, 4, 7, 8, 11, 12, 17, 20, 24, 26, 27, 29, 30, 34, 35, 36, 40, 43, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 63, 64, 65, 66, 67, 68, 72, 73, 75, 79, 89, 91, 97],:]
frame_3 = [train_2,unlabel_3]
train_3 = pd.concat(frame_3)

x_train_3 = tfidf_vect.transform(train_3['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_3,train_3['Target'],test_size=0.2,random_state=42)
support = svm.LinearSVC(multi_class='ovr',class_weight='balanced')
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_3=calibrator.fit(X_train, Y_train)
pred_svm = model_3.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6677524429967426
                  precision    recall  f1-score   support

        Analysis       0.55      0.38      0.45        86
      Conclusion       0.70      0.41      0.52        39
           Facts       0.73      0.92      0.81       320
         Invalid       0.54      0.44      0.48        98
           Issue       0.54      0.41      0.47        34
Rule/Law/Holding       0.47      0.24      0.32        37

        accuracy                           0.67       614
       macro avg       0.59      0.47      0.51       614
    weighted avg       0.64      0.67      0.64       614



In [25]:
x_un4 = tfidf_vect.transform(unlabel_4['Sentence'])
pred_unlabel_4 = model_3.predict_proba(x_un4)


pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_4:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[0, 3, 2, 5, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 3, 2, 2, 2, 3, 0, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 2]
[0.997943446763141, 0.9852843116217754, 0.9988769393725013, 0.9907917607503529, 0.99789301655246, 0.995473396597045, 0.9999231717681648, 0.9899169848060247, 0.9824567259047031, 0.999487405018733, 0.9995586692961672, 0.9802546073596572, 0.9970555116770976, 0.9986894931260931, 0.9962755523384537, 0.9961022679378039, 0.9995647166270725, 0.9828788660256775, 0.9970284706070329, 0.9992326668230959, 0.9997263281105137, 0.9986810582529743, 0.9944509451387826, 0.9978175004172027, 0.9996506116129149, 0.9978978873575383, 0.9875356957525606, 0.9976918950842669, 0.9988768554872981, 0.9856549437928452, 0.9965539432682191, 0.9994905980070073, 0.9986718916660761, 0.9891232330028129, 0.9835791979836034, 0.9876382129421701, 0.9940498541643288, 0.9970535359028991, 0.9865775550894007, 0.9916665153598218, 0.9961503829865389, 0.9906860398535288, 0.99996650877616

In [26]:
unlabel_4=unlabel_4.loc[[2, 3, 4, 6, 7, 8, 10, 11, 12, 15, 18, 20, 21, 22, 27, 28, 29, 30, 31, 32, 34, 37, 38, 39, 40, 42, 46, 49, 50, 54, 55, 56, 58, 60, 61, 64, 66, 67, 69, 70, 71, 72, 75, 85, 87, 93, 94, 97, 99],:]

In [27]:
frame_4 = [train_3,unlabel_4]
train_4 = pd.concat(frame_4)
len(train_4)
x_train_4 = tfidf_vect.transform(train_4['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_4,train_4['Target'],test_size=0.27,random_state=42)
support = svm.LinearSVC()
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_4=calibrator.fit(X_train, Y_train)
pred_svm = model_4.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6520190023752969
                  precision    recall  f1-score   support

        Analysis       0.45      0.41      0.43       109
      Conclusion       0.56      0.42      0.48        43
           Facts       0.77      0.88      0.82       453
         Invalid       0.45      0.43      0.44       128
           Issue       0.46      0.48      0.47        48
Rule/Law/Holding       0.44      0.18      0.26        61

        accuracy                           0.65       842
       macro avg       0.52      0.47      0.48       842
    weighted avg       0.63      0.65      0.63       842



In [28]:
x_un5 = tfidf_vect.transform(unlabel_5['Sentence'])

pred_unlabel_5 = model_4.predict_proba(x_un5)
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_5:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[2, 2, 2, 2, 2, 5, 3, 2, 2, 2, 1, 2, 1, 0, 1, 3, 2, 2, 2, 2, 2, 2, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 3, 0, 3]
[0.9992618932006057, 0.999106829178187, 0.9819477363472758, 0.9881782260378613, 0.9822047693029919, 0.9927424469926617, 0.9981105480407588, 0.9953450149485725, 0.998612517254091, 0.9991582746790548, 0.9813434078035382, 0.9956586499757374, 0.9992283062702982, 0.983283503479984, 0.9963156685388107, 0.980263257073298, 0.9915168103788179, 0.9947812879494868, 0.9997507842836444, 0.9987224208487459, 0.9994078310100827, 0.9975708850580389, 0.9813434078035382, 0.999199145807247, 0.9856299271227739, 0.998872172343334, 0.9999292403885802, 0.9855609132069325, 0.9995061423259304, 0.9928920245522888, 0.9881588255147763, 0.9992415859067083, 0.9993626506715114, 0.9972742871171162, 0.9937597655618835, 0.9916947407688811, 0.9968918768843529, 0.9948016304030136, 0.9917925742362721]
[3, 7, 9, 12, 17, 18, 19, 22, 23, 26, 31, 32, 36, 37, 46, 48, 50, 52, 53, 59, 60, 63, 64, 65, 69, 73, 82, 8

In [None]:
unlabel_5 = unlabel_5.loc[[3, 7, 9, 12, 17, 18, 19, 22, 23, 26, 31, 32, 36, 37, 46, 48, 50, 52, 53, 59, 60, 63, 64, 65, 69, 73, 82, 85, 86, 91, 92, 94, 95, 97, 103, 104, 107, 110, 117],:]

In [29]:
frame_5 = [train_4,unlabel_5]
train_5 = pd.concat(frame_5)
len(train_5)
x_train_5 = tfidf_vect.transform(train_5['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_5,train_5['Target'],test_size=0.2,random_state=42)
support = svm.LinearSVC()
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_5=calibrator.fit(X_train, Y_train)
pred_svm = model_5.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6692426584234931
                  precision    recall  f1-score   support

        Analysis       0.49      0.44      0.47        93
      Conclusion       0.53      0.59      0.56        32
           Facts       0.79      0.88      0.83       347
         Invalid       0.44      0.43      0.43       101
           Issue       0.57      0.41      0.48        39
Rule/Law/Holding       0.59      0.29      0.38        35

        accuracy                           0.67       647
       macro avg       0.57      0.51      0.53       647
    weighted avg       0.65      0.67      0.66       647



In [30]:
t_p = tfidf_vect.transform(test['Sentence'])
test_pred = model_5.predict(t_p)
print('Accuracy %s' % accuracy_score(test_pred,test['Target']))

Accuracy 0.7293666026871402


In [31]:
from sklearn.metrics import precision_recall_fscore_support
print(precision_recall_fscore_support(test['Target'],test_pred,average='macro'))

(0.706574797286541, 0.6493928589860234, 0.6721828117813521, None)
