In [0]:
import pandas as pd
import numpy as np

In [0]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [0]:
!ls '/content/gdrive/My Drive/Innoplexus/train_3PIRKSI'

train.csv


In [0]:
data=pd.read_csv('/content/gdrive/My Drive/Innoplexus/train_3PIRKSI/train.csv')

In [0]:
#Forward Fill
data=data.fillna(method='ffill')

In [0]:
data.columns

Index(['id', 'Doc_ID', 'Sent_ID', 'Word', 'tag'], dtype='object')

In [0]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w,t) for w,t in zip(s["Word"].values.tolist(),s["tag"].values.tolist())]
        self.grouped = self.data.groupby("Sent_ID").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [0]:
getter = SentenceGetter(data)
sent = getter.get_next()
print(sent)

[('Obesity', 'O'), ('in', 'O'), ('Low-', 'O'), ('and', 'O'), ('Middle-Income', 'O'), ('Countries', 'O'), (':', 'O'), ('Burden', 'O'), (',', 'O'), ('Drivers', 'O'), (',', 'O'), ('and', 'O'), ('Emerging', 'O'), ('Challenges', 'O'), ('.', 'O')]


In [0]:
sentences = getter.sentences

In [0]:
!pip install regex



In [0]:
import re
def shape(word):
    word_shape = 'other'
    if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
        word_shape = 'number'
    elif re.match('\W+$', word):
        word_shape = 'punct'
    elif re.match('[A-Z][a-z]+$', word):
        word_shape = 'capitalized'
    elif re.match('[A-Z]+$', word):
        word_shape = 'uppercase'
    elif re.match('[a-z]+$', word):
        word_shape = 'lowercase'
    elif re.match('[A-Z][a-z]+[A-Z][a-z]+[A-Za-z]*$', word):
        word_shape = 'camelcase'
    elif re.match('[A-Za-z]+$', word):
        word_shape = 'mixedcase'
    elif re.match('__.+__$', word):
        word_shape = 'wildcard'
    elif re.match('[A-Za-z0-9]+\.$', word):
        word_shape = 'ending-dot'
    elif re.match('[A-Za-z0-9]+\.[A-Za-z0-9\.]+\.$', word):
        word_shape = 'abbreviation'
    elif re.match('[A-Za-z0-9]+\-[A-Za-z0-9\-]+.*$', word):
        word_shape = 'contains-hyphen'
 
    return word_shape

In [0]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'lemma': stemmer.stem(word),
        'shape': shape(word),
        'lemma[-3:]': stemmer.stem(word[-3:]),
        'shape[-3:]': shape(word[-3:]),
        'lemma[-2:]': stemmer.stem(word[-2:]),
        'shape[-2:]': shape(word[-2:]),

    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token,label in sent]

def sent2tokens(sent):
    return [token for token,label in sent]

In [0]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [0]:
!pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /usr/local/lib/python3.6/dist-packages (0.20.3)


In [0]:
pip install sklearn-crfsuite



In [0]:
from sklearn_crfsuite import CRF

crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [0]:
crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [0]:
from sklearn_crfsuite import CRF

crf1 = CRF(algorithm='lbfgs',
          c1=1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [0]:
crf1.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [0]:
from sklearn_crfsuite import CRF

crf2 = CRF(algorithm='lbfgs',
          c1=0.01,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [0]:
crf2.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.01, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [0]:
from sklearn_crfsuite import CRF

crf3 = CRF(algorithm='lbfgs',
          c1=0.01,
          c2=0.01,
          max_iterations=100,
          all_possible_transitions=False)

In [0]:
crf3.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.01, c2=0.01,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [0]:
from sklearn_crfsuite import CRF

crf4 = CRF(algorithm='lbfgs',
          c1=0.01,
          c2=0.005,
          max_iterations=100,
          all_possible_transitions=False)

In [0]:
crf4.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.01,
  c2=0.005, calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [0]:
testdata=pd.read_csv('/content/gdrive/My Drive/Innoplexus/test_XEV14AD/test.csv')

In [0]:
testdata.head()

Unnamed: 0,id,Doc_ID,Sent_ID,Word
0,4543834,30001,191283,CCCVA
1,4543835,30001,191283,","
2,4543836,30001,191283,MANOVA
3,4543837,30001,191283,","
4,4543838,30001,191283,my


In [0]:
class testSentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 191283
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w) for w in zip(s["Word"].values.tolist())]
        self.grouped = self.data.groupby("Sent_ID").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [0]:
testgetter=testSentenceGetter(testdata)

In [0]:
testsent = testgetter.get_next()
print(testsent)

[('CCCVA',), (',',), ('MANOVA',), (',',), ('my',), ('black',), ('hen',), ('.',)]


In [0]:
testsentences = testgetter.sentences

In [0]:
def testword2features(sent, i):
    word = str(sent[i][0])

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'lemma': stemmer.stem(word),
        'shape': shape(word),
        'lemma[-3:]': stemmer.stem(word[-3:]),
        'shape[-3:]': shape(word[-3:]),
        'lemma[-2:]': stemmer.stem(word[-2:]),
        'shape[-2:]': shape(word[-2:]),

    }
    if i > 0:
        word1 = str(sent[i-1][0])
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = str(sent[i+1][0])
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features


def testsent2features(sent):
    return [testword2features(sent, i) for i in range(len(sent))]

In [0]:
X_test = [testsent2features(s) for s in testsentences]

In [0]:
from sklearn_crfsuite import CRF

crf5 = CRF(algorithm='lbfgs',
          c1=0.2,
          c2=0.2,
          max_iterations=100,
          all_possible_transitions=False)

In [0]:
crf5.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.2, c2=0.2,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [0]:
y_pred = crf.predict(X_test)

In [0]:
y_pred1 = crf1.predict(X_test)

In [0]:
y_pred2 = crf2.predict(X_test)

In [0]:
y_pred3 = crf3.predict(X_test)

In [0]:
y_pred4 = crf4.predict(X_test)

In [0]:
y_pred5 = crf5.predict(X_test)

In [0]:
print(len(y_pred))
print(len(y_pred1))
print(len(y_pred2))
print(len(y_pred3))
print(len(y_pred4))

125840
125840
125840
125840
125840


In [0]:
print(len(y_pred3))

125840


In [0]:
import numpy as np
myarray = np.asarray(y_pred)
myarray1=np.asarray(y_pred1)
myarray2=np.asarray(y_pred2)

In [0]:
myarray = np.asarray(y_pred)

In [0]:
myarray3=np.asarray(y_pred3)

In [0]:
myarray4=np.asarray(y_pred4)

In [0]:
myarray5=np.asarray(y_pred5)

In [0]:
testdata.shape

(2994463, 4)

In [0]:
predictions=np.chararray((2994463,1))
k=0
for i in range(myarray.shape[0]):
  for j in range(len(myarray[i])):
    predictions[k]=myarray[i][j]
    k=k+1

In [0]:
predictions1=np.chararray((2994463,1))
k=0
for i in range(myarray1.shape[0]):
  for j in range(len(myarray1[i])):
    predictions1[k]=myarray1[i][j]
    k=k+1

In [0]:
predictions2=np.chararray((2994463,1))
k=0
for i in range(myarray2.shape[0]):
  for j in range(len(myarray2[i])):
    predictions2[k]=myarray2[i][j]
    k=k+1

In [0]:
predictions3=np.chararray((2994463,1))
k=0
for i in range(myarray3.shape[0]):
  for j in range(len(myarray3[i])):
    predictions3[k]=myarray3[i][j]
    k=k+1

In [0]:
predictions4=np.chararray((2994463,1))
k=0
for i in range(myarray4.shape[0]):
  for j in range(len(myarray4[i])):
    predictions4[k]=myarray4[i][j]
    k=k+1

In [0]:
predictions5=np.chararray((2994463,1))
k=0
for i in range(myarray5.shape[0]):
  for j in range(len(myarray5[i])):
    predictions5[k]=myarray5[i][j]
    k=k+1

In [0]:
predict=pd.DataFrame(predictions)
predict1=pd.DataFrame(predictions1)
predict2=pd.DataFrame(predictions2)

In [0]:
predict=pd.DataFrame(predictions)

In [0]:
predict3=pd.DataFrame(predictions3)

In [0]:
predict4=pd.DataFrame(predictions4)

In [0]:
predict5=pd.DataFrame(predictions5)

In [0]:
predict.head()

Unnamed: 0,0
0,b'O'
1,b'O'
2,b'O'
3,b'O'
4,b'O'


In [0]:
predict1.head()

Unnamed: 0,0
0,b'O'
1,b'O'
2,b'O'
3,b'O'
4,b'O'


In [0]:
predict2.head()

Unnamed: 0,0
0,b'O'
1,b'O'
2,b'O'
3,b'O'
4,b'O'


In [0]:
predict3.head()

Unnamed: 0,0
0,b'O'
1,b'O'
2,b'O'
3,b'O'
4,b'O'


In [0]:
predict4.head()

Unnamed: 0,0
0,b'O'
1,b'O'
2,b'O'
3,b'O'
4,b'O'


In [0]:
predict5.head()

Unnamed: 0,0
0,b'O'
1,b'O'
2,b'O'
3,b'O'
4,b'O'


In [0]:
predict['id']=testdata['id']
predict['Sent_ID']=testdata['Sent_ID']
predict['tag']=predict[0]

In [0]:
predict1['id']=testdata['id']
predict1['Sent_ID']=testdata['Sent_ID']
predict1['tag']=predict1[0]

In [0]:
predict2['id']=testdata['id']
predict2['Sent_ID']=testdata['Sent_ID']
predict2['tag']=predict2[0]

In [0]:
predict3['id']=testdata['id']
predict3['Sent_ID']=testdata['Sent_ID']
predict3['tag']=predict3[0]

In [0]:
predict4['id']=testdata['id']
predict4['Sent_ID']=testdata['Sent_ID']
predict4['tag']=predict4[0]

In [0]:
predict5['id']=testdata['id']
predict5['Sent_ID']=testdata['Sent_ID']
predict5['tag']=predict5[0]

In [0]:
del predict5[0]

In [0]:
del predict4[0]

In [0]:
del predict3[0]

In [0]:
del predict[0]

In [0]:
del predict[0]
del predict1[0]
del predict2[0]

In [0]:
predict['tag'].value_counts()

b'O'    2937513
b'B'      30249
b'I'      26701
Name: tag, dtype: int64

In [0]:
predict1['tag'].value_counts()

b'O'    2939925
b'B'      28868
b'I'      25670
Name: tag, dtype: int64

In [0]:
predict2['tag'].value_counts()

b'O'    2937008
b'B'      30280
b'I'      27175
Name: tag, dtype: int64

In [0]:
predict3['tag'].value_counts()

b'O'    2935837
b'B'      31110
b'I'      27516
Name: tag, dtype: int64

In [0]:
predict4['tag'].value_counts()

b'O'    2934633
b'B'      31474
b'I'      28356
Name: tag, dtype: int64

In [0]:
predict5['tag'].value_counts()

b'O'    2938480
b'B'      29770
b'I'      26213
Name: tag, dtype: int64

In [0]:
ense=pd.DataFrame({'tag1':predict['tag'],'tag4':predict3['tag'],'tag6':predict5['tag']})

In [0]:
ense.head()

Unnamed: 0,tag1,tag4,tag6
0,b'O',b'O',b'O'
1,b'O',b'O',b'O'
2,b'O',b'O',b'O'
3,b'O',b'O',b'O'
4,b'O',b'O',b'O'


In [0]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(ense['tag1'])
ense['tag1']=le.transform(ense['tag1'])
ense['tag4']=le.transform(ense['tag4'])
ense['tag6']=le.transform(ense['tag6'])

In [0]:
ense.shape

(2994463, 3)

In [0]:
final_predict=pd.DataFrame({'id':testdata['id'],'Sent_ID':testdata['Sent_ID']})

In [0]:
a=final_predict.mode(axis=1)

In [0]:
 predict.to_csv('sub13.csv', index=False)

In [0]:
 predict5.to_csv('sub15.csv', index=False)

In [0]:
 predict1.to_csv('sub8.csv', index=False)

In [0]:
 predict2.to_csv('sub9.csv', index=False)

In [0]:
 predict3.to_csv('sub14.csv', index=False)

In [0]:
 predict4.to_csv('sub11.csv', index=False)