<a href="https://colab.research.google.com/github/sundarp17/semi-supervised/blob/main/train9_svc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:

train9 = pd.read_csv(r'/content/train_data9.csv')
test = pd.read_csv(r'/content/test_data.csv')

In [3]:

train9['Target'] = train9['Target'].replace(['Others'],'Invalid')


In [4]:
train9['Target'].unique()

array(['Invalid', 'Issue', 'Analysis', 'Facts', 'Conclusion',
       'Rule/Law/Holding'], dtype=object)

In [5]:
#cleaning
import nltk
import re
import string
nltk.download('stopwords')
nltk.download('wordnet')
stopword=nltk.corpus.stopwords.words('english')
from nltk.stem import WordNetLemmatizer
wl= WordNetLemmatizer()

def clean_text(text):
  text="".join([word.lower() for word in text if word not in string.punctuation])
  tokens = re.split('\W+',text)
  text = [wl.lemmatize(word) for word in tokens if word not in stopword]
  return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer = clean_text)

train9_Xtfidf = tfidf_vect.fit_transform(train9['Sentence'])


In [7]:
test['Target']=test['Target'].replace(['Others'],'Invalid')
test['Sentence'] = test['Sentence'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
test['Sentence'] = test['Sentence'].str.replace('[^\w\s]','')
from nltk.corpus import stopwords
words = stopwords.words('english')
test['Sentence'] = test['Sentence'].apply(lambda x: " ".join(x for x in x.split() if x not in words))

In [8]:
unlabel_1 = test.loc[:100]
print("length of unlabel_1",len(unlabel_1))
unlabel_2 = test.loc[101:200]
print("length of unlabel_2",len(unlabel_2))
unlabel_3 = test.loc[201:300]
print("length of unlabel_3",len(unlabel_3))
unlabel_4 = test.loc[301:400]
print("length of unlabel_4",len(unlabel_4))
unlabel_5 = test.loc[401:]
print("length of unlabel_5",len(unlabel_5))

length of unlabel_1 101
length of unlabel_2 100
length of unlabel_3 100
length of unlabel_4 100
length of unlabel_5 120


In [9]:
unlabel_2.reset_index(inplace=True)
del unlabel_2['index']
unlabel_2.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,1207,examination occurred night middle may testifie...,Invalid
1,3400,undisputed testimony reflects appellant drivin...,Facts
2,2072,appellant testify offer evidence behalf,Facts
3,517,56 uniform act regulating traffic highways gis...,Rule/Law/Holding
4,906,agreed accompany gave pawn ticket money reclai...,Facts


In [10]:
unlabel_3.reset_index(inplace=True)
del unlabel_3['index']
unlabel_3.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,1882,find evidence amply sufficient sustain juryâs ...,Analysis
1,964,actions show thatâ,Invalid
2,1417,certification attending physician shown filed ...,Facts
3,2932,testified agent bland first left automobile wa...,Facts
4,1584,officer testified could smell odor alcohol app...,Facts


In [11]:
unlabel_4.reset_index(inplace=True)
del unlabel_4['index']
unlabel_4.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,266,section 6 pl,Invalid
1,3469,amended answer appellant alleged bond invalid ...,Analysis
2,1471,evidence sufficient support conviction appella...,Analysis
3,1990,cases cited note 19,Invalid
4,2757,july 8 1960 order entered revoking said probat...,Facts


In [12]:
unlabel_5.reset_index(inplace=True)
del unlabel_5['index']
unlabel_5.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,2979,overrule appellantâs contention court erred pe...,Analysis
1,2135,state regarded authority case,Invalid
2,1591,testimony adduced appellant mother lillie mae ...,Facts
3,658,appellant needle marks inside left elbow,Facts
4,2562,analysis contents 3 capsules chemist kenneth a...,Facts


In [13]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.calibration import CalibratedClassifierCV


X_train, x_val, Y_train, y_val = train_test_split(train9_Xtfidf,train9['Target'],test_size=0.26,random_state=42)
support = svm.LinearSVC(multi_class='ovr',class_weight='balanced')
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model=calibrator.fit(X_train, Y_train)
pred_svm = model.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6780973451327433
                  precision    recall  f1-score   support

        Analysis       0.53      0.40      0.46       127
      Conclusion       0.76      0.46      0.57        48
           Facts       0.74      0.92      0.82       476
         Invalid       0.51      0.45      0.48       131
           Issue       0.51      0.44      0.47        52
Rule/Law/Holding       0.69      0.26      0.37        70

        accuracy                           0.68       904
       macro avg       0.62      0.49      0.53       904
    weighted avg       0.66      0.68      0.65       904



In [14]:
x_un1 = tfidf_vect.transform(unlabel_1['Sentence'])
pred_unlabel_1 = model.predict_proba(x_un1)

In [15]:
import numpy as np
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_1:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[2, 2, 2, 2, 2, 2, 0, 2, 2, 1, 3, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3]
[0.9982862876077865, 0.999888709475179, 0.9969887488152119, 0.9981618925156005, 0.9980349961098464, 0.9954912238191457, 0.9882749407112594, 0.9860927059482004, 0.9976728939044757, 0.9986372021510044, 0.9861153030094606, 0.9992633458025939, 0.9997883829812441, 0.982558381263078, 0.991383062687461, 0.9921545204180701, 0.9824118060078745, 0.995412325687642, 0.9980889417016526, 0.9996103140650974, 0.9994103245097187, 0.9915285829702896, 0.9967018493713722, 0.9925541584533835, 0.9954453037476949, 0.9980862866286445, 0.9963479701417108, 0.9945309806161342, 0.9993726580672774, 0.9926767674473652, 0.9885805938519155, 0.9999121520331977, 0.9845036229600803, 0.9928361058004872, 0.9914623292894965, 0.9970129856267721, 0.9994570614559848, 0.9899109875473764, 0.9993896405178684, 0.9972475963293708]
[4, 5, 11, 12, 17, 18, 20, 21, 23, 24, 26, 29, 31, 33, 35, 36, 37, 39, 40, 45, 48,

In [16]:
unlabel_1=unlabel_1.loc[[4, 5, 11, 12, 17, 18, 20, 21, 23, 24, 26, 29, 31, 33, 35, 36, 37, 39, 40, 45, 48, 49, 50, 51, 52, 55, 56, 60, 62, 70, 72, 76, 79, 84, 87, 89, 94, 95, 96, 98],:]
unlabel_1.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
4,79,eligible leaves renew contacts family,Facts
5,1955,got downstairs ponce went car wife left home,Facts
11,1604,police notified deceased found lying mrs moers...,Facts
12,1102,shown vine street drug storeâs supply narcotic...,Facts
17,2795,1951 ford came stop knocking several fence pos...,Facts


In [17]:
frame_1 = [train9,unlabel_1]
train9_1 = pd.concat(frame_1)
len(train9_1)

3516

In [18]:
x_train_1 = tfidf_vect.transform(train9_1['Sentence'])
x_train_1.shape
X_train, x_val, Y_train, y_val = train_test_split(x_train_1,train9_1['Target'],test_size=0.2,random_state=42)
support = svm.LinearSVC(multi_class='ovr',class_weight='balanced')
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_1=calibrator.fit(X_train, Y_train)
pred_svm = model_1.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6619318181818182
                  precision    recall  f1-score   support

        Analysis       0.47      0.42      0.44        98
      Conclusion       0.70      0.53      0.60        36
           Facts       0.74      0.89      0.81       361
         Invalid       0.51      0.49      0.50       112
           Issue       0.67      0.47      0.55        38
Rule/Law/Holding       0.56      0.17      0.26        59

        accuracy                           0.66       704
       macro avg       0.61      0.50      0.53       704
    weighted avg       0.64      0.66      0.64       704



In [19]:
x_un2 = tfidf_vect.transform(unlabel_2['Sentence'])
pred_unlabel_2 = model_1.predict_proba(x_un2)
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_2:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 3, 1, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2, 5, 2, 2]
[0.9934157998976475, 0.9991494865229922, 0.9907146043400668, 0.9981259628853466, 0.9957714419060029, 0.9973504352805337, 0.9950532778066642, 0.9999685902940576, 0.9835360072900953, 0.9872272994295882, 0.9959880281873696, 0.9943984455471434, 0.9957063383314383, 0.9939565421490797, 0.9997281458798072, 0.9801744635827079, 0.9966868518498457, 0.992878907764031, 0.9992597142367959, 0.9998467532402695, 0.9990144948243601, 0.997899958770332, 0.9978335015321139, 0.9889158977235173, 0.9926839533304003, 0.9997119431027232, 0.9997324061126742, 0.999915155159875, 0.9988934165640806, 0.9991710277657194, 0.9899621066144757, 0.997359889093084, 0.9992700721349381, 0.9851288079514388, 0.994066833013762, 0.9980642919221575, 0.9921461162931687, 0.9991388782889942, 0.9962286176323569, 0.9811027063933447, 0.9832036774035502, 0.986484963149883]
[0, 2, 3, 4, 7, 8, 9, 12, 13, 14, 15, 

In [20]:
unlabel_2 = unlabel_2.loc[[0, 2, 3, 4, 7, 8, 9, 12, 13, 14, 15, 17, 20, 21, 29, 31, 32, 34, 36, 37, 39, 42, 49, 55, 58, 59, 63, 64, 69, 70, 71, 75, 79, 80, 83, 85, 86, 88, 91, 93, 95, 99],:]
unlabel_2.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,1207,examination occurred night middle may testifie...,Invalid
2,2072,appellant testify offer evidence behalf,Facts
3,517,56 uniform act regulating traffic highways gis...,Rule/Law/Holding
4,906,agreed accompany gave pawn ticket money reclai...,Facts
7,3150,finding door facing street seen man walking pr...,Facts


In [21]:
frame_2 = [train9_1,unlabel_2]
train9_2 = pd.concat(frame_2)
len(train9_2)

3558

In [22]:
x_train_2 = tfidf_vect.transform(train9_2['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_2,train9_2['Target'],test_size=0.27,random_state=2)
support = svm.LinearSVC(multi_class='ovr',class_weight='balanced')
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_2=calibrator.fit(X_train, Y_train)
pred_svm = model_2.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6399583766909469
                  precision    recall  f1-score   support

        Analysis       0.45      0.36      0.40       152
      Conclusion       0.79      0.34      0.48        44
           Facts       0.72      0.90      0.80       497
         Invalid       0.45      0.42      0.44       152
           Issue       0.66      0.33      0.44        64
Rule/Law/Holding       0.47      0.29      0.36        52

        accuracy                           0.64       961
       macro avg       0.59      0.44      0.48       961
    weighted avg       0.62      0.64      0.62       961



In [23]:
x_un3 = tfidf_vect.transform(unlabel_3['Sentence'])
pred_unlabel_3 = model_2.predict_proba(x_un3)

pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_3:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[0, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[0.9873826779117697, 0.983868187505244, 0.9916693302268067, 0.9991678473405334, 0.9988339559462865, 0.9855375207067237, 0.9997314014506599, 0.9998254258897863, 0.9976274926429954, 0.9923040341170951, 0.9996263628133595, 0.9973002307056609, 0.9807085153990159, 0.988636036672078, 0.9951485180963396, 0.9982847986366201, 0.9851093420065008, 0.9927869338496996, 0.9993208188263748, 0.9981862631067762, 0.9914027086152718, 0.9857072731124102, 0.996543140588619, 0.9921829114995656, 0.9973631123684282, 0.9925433214506207, 0.9991057075781198, 0.9949356473424328, 0.9998425613128787, 0.9908316262611386, 0.9996648853101802, 0.9989108188754517, 0.9996970849566924, 0.995849790916773, 0.9814549481601849, 0.9996416329327904, 0.989448619000912, 0.9971276319225862, 0.9951101294218655]
[0, 3, 4, 7, 8, 9, 12, 20, 24, 26, 27, 30, 32, 33, 35, 36, 41, 42, 43, 46, 47, 48, 51, 52, 54, 56, 63, 64,

In [24]:
unlabel_3=unlabel_3.loc[[0, 3, 4, 7, 8, 9, 12, 20, 24, 26, 27, 30, 32, 33, 35, 36, 41, 42, 43, 46, 47, 48, 51, 52, 54, 56, 63, 64, 65, 67, 68, 70, 72, 75, 79, 89, 91, 96, 97],:]
frame_3 = [train9_2,unlabel_3]
train9_3 = pd.concat(frame_3)

x_train_3 = tfidf_vect.transform(train9_3['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_3,train9_3['Target'],test_size=0.2,random_state=42)
support = svm.LinearSVC(multi_class='ovr',class_weight='balanced')
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_3=calibrator.fit(X_train, Y_train)
pred_svm = model_3.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6666666666666666
                  precision    recall  f1-score   support

        Analysis       0.47      0.46      0.47        99
      Conclusion       0.72      0.50      0.59        46
           Facts       0.77      0.90      0.83       377
         Invalid       0.48      0.41      0.44       108
           Issue       0.56      0.47      0.51        43
Rule/Law/Holding       0.38      0.19      0.25        47

        accuracy                           0.67       720
       macro avg       0.56      0.49      0.51       720
    weighted avg       0.64      0.67      0.65       720



In [25]:
x_un4 = tfidf_vect.transform(unlabel_4['Sentence'])
pred_unlabel_4 = model_3.predict_proba(x_un4)


pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_4:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[0, 2, 5, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 3, 2, 2, 3, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 5, 2, 2]
[0.9997166197838472, 0.98926718086285, 0.9820628250700116, 0.9982486402343856, 0.9995709193468206, 0.9879211300993875, 0.9995203746026369, 0.9996835986577, 0.9933207960588835, 0.9969098710692684, 0.996273319566354, 0.998380330222025, 0.9986860987412465, 0.9950282040816957, 0.9996367771439042, 0.9991309824870448, 0.9992621784183258, 0.9957959521057225, 0.9997339000156499, 0.9962361020459193, 0.9872149590551894, 0.9954167566154918, 0.9980812811030908, 0.9940779178984175, 0.9990167486046996, 0.9969746147960156, 0.9829122159005255, 0.9879237084173713, 0.9908300011823186, 0.9990556976355948, 0.9943119630576421, 0.9992278752218278, 0.9984905798025994, 0.9999688633504827, 0.9854748705211666, 0.9926175865242387, 0.9967432761318213, 0.9979692011764456, 0.9808510647952198, 0.9822565805572081, 0.9866297339778362, 0.9963194020217845]
[2, 4, 6, 7, 10, 13, 15, 18, 21, 22, 27

In [26]:
unlabel_4=unlabel_4.loc[[2, 4, 6, 7, 10, 13, 15, 18, 21, 22, 27, 28, 29, 30, 32, 34, 37, 38, 40, 46, 48, 49, 50, 55, 56, 58, 60, 61, 66, 67, 69, 70, 71, 75, 76, 82, 85, 87, 93, 94, 97, 99],:]

In [27]:
frame_4 = [train9_3,unlabel_4]
train9_4 = pd.concat(frame_4)
len(train9_4)
x_train_4 = tfidf_vect.transform(train9_4['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_4,train9_4['Target'],test_size=0.27,random_state=42)
support = svm.LinearSVC()
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_4=calibrator.fit(X_train, Y_train)
pred_svm = model_4.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6581892166836215
                  precision    recall  f1-score   support

        Analysis       0.44      0.42      0.43       137
      Conclusion       0.59      0.51      0.55        45
           Facts       0.78      0.87      0.82       527
         Invalid       0.48      0.44      0.46       153
           Issue       0.42      0.32      0.36        57
Rule/Law/Holding       0.53      0.39      0.45        64

        accuracy                           0.66       983
       macro avg       0.54      0.49      0.51       983
    weighted avg       0.64      0.66      0.65       983



In [28]:
x_un5 = tfidf_vect.transform(unlabel_5['Sentence'])

pred_unlabel_5 = model_4.predict_proba(x_un5)
pos=[]
large=[]
ind = []
i=0
for j in pred_unlabel_5:
  if max(j)> 0.98:
    ind.append(np.argmax(j))
    large.append(max(j))
    pos.append(i)
  i+=1


print(ind)
print(large)
print(pos)
print(len(ind))
print(len(large))
print(len(pos))

[1, 2, 2, 3, 2, 1, 2, 0, 5, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 3, 3, 0, 0, 0]
[0.988642450866154, 0.9989607972200839, 0.9958960315831009, 0.9941114016226456, 0.9836061478280091, 0.9937027160595004, 0.997447228346675, 0.9897171699156364, 0.9948489045534549, 0.9895712728708203, 0.9853865266357221, 0.9959267994258731, 0.9993623376224963, 0.9996405949438708, 0.9942441330920677, 0.9861113838280503, 0.9849270258195855, 0.9998455222697711, 0.9958843296268052, 0.9990211598875591, 0.9994851979241426, 0.9859867437749067, 0.9978543277639295, 0.9983085574757223, 0.9980746062375948, 0.9929690148395756, 0.9931201212986439, 0.999917124856048, 0.9981785661697492, 0.9838459892018341, 0.9995010474629901, 0.998763243021456, 0.9889009056577867, 0.9930170640720455, 0.9918843560629877, 0.9857122436814089, 0.9962197547487853, 0.9957135798162706, 0.9897924725902356, 0.9894956759826435]
[0, 3, 7, 8, 9, 10, 12, 16, 18, 19, 20, 22, 23, 26, 27, 30, 32, 36, 46, 52, 53, 56

In [29]:
unlabel_5 = unlabel_5.loc[[0, 3, 7, 8, 9, 10, 12, 16, 18, 19, 20, 22, 23, 26, 27, 30, 32, 36, 46, 52, 53, 56, 60, 63, 65, 67, 75, 82, 86, 91, 94, 95, 97, 100, 104, 106, 107, 110, 112, 114],:]

In [30]:
frame_5 = [train9_4,unlabel_5]
train9_5 = pd.concat(frame_5)
len(train9_5)
x_train_5 = tfidf_vect.transform(train9_5['Sentence'])
X_train, x_val, Y_train, y_val = train_test_split(x_train_5,train9_5['Target'],test_size=0.27,random_state=42)
support = svm.LinearSVC()
clf = support.fit(X_train, Y_train)
calibrator = CalibratedClassifierCV(clf, cv='prefit')
model_5=calibrator.fit(X_train, Y_train)
pred_svm = model_5.predict(x_val)
print('Accuracy %s' % accuracy_score(pred_svm,y_val))
print(classification_report(y_val,pred_svm))

Accuracy 0.6750503018108652
                  precision    recall  f1-score   support

        Analysis       0.44      0.42      0.43       146
      Conclusion       0.64      0.43      0.51        54
           Facts       0.81      0.87      0.84       522
         Invalid       0.50      0.56      0.53       141
           Issue       0.59      0.51      0.55        59
Rule/Law/Holding       0.48      0.29      0.36        72

        accuracy                           0.68       994
       macro avg       0.58      0.51      0.54       994
    weighted avg       0.66      0.68      0.67       994



In [31]:
t_p = tfidf_vect.transform(test['Sentence'])
test_pred = model_5.predict(t_p)
print('Accuracy %s' % accuracy_score(test_pred,test['Target']))

Accuracy 0.6794625719769674


In [32]:
from sklearn.metrics import precision_recall_fscore_support
print(precision_recall_fscore_support(test['Target'],test_pred,average='macro'))

(0.6330645088374329, 0.5798561889389177, 0.5965774270703806, None)
