In [None]:
import pickle
import numpy as np
from google.colab import drive

# Mount Google Drive and record where the POS-tagged corpus lives.
drive.mount('/content/drive')
corpus = '/content/hindi-token-bis-pos-conll-treebank.txt'

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load an embeddings file that comes from a trusted source.
with open ("./embeddings.pickle", "rb") as f:
  e = pickle.load(f)

# The unpickled object is indexed by two keys: the embedding table and the
# dictionary used to look up rows of that table.
embeddings = e['embeddings']
dic = e['dictionary']

def preprocess_data(with_pos_tag = False):
  """Read the tab-separated corpus and map every entry to an embedding vector.

  The file at the module-level ``corpus`` path is read as UTF-8. Each line
  that contains at least one tab contributes one entry: the first field is
  the token and the second field is its tag (both stripped). Lines without
  a tab are skipped. The first 80% of the entries (integer division) form
  the training split and the remainder the test split.

  Args:
    with_pos_tag: when true, every entry is the string "<token> <tag>";
      otherwise the entry is the token alone.

  Returns:
    A tuple ``(train_set, test_set, X_train, X_test)``: the two lists of
    entries and, for each, a numpy array built from
    ``embeddings[dic[entry]]``. Entries that are not in ``dic`` use
    ``dic['UNK']`` instead.

  NOTE(review): with ``with_pos_tag`` enabled the key looked up in ``dic`` is
  the combined "<token> <tag>" string. Confirm the dictionary really holds
  such keys; if it does not, every entry falls back to the UNK vector.
  """
  total_states = []
  # The context manager closes the corpus file even if parsing fails.
  with open(corpus, "r", encoding = 'utf-8') as corpus_file:
    for raw_line in corpus_file:
      fields = raw_line.split('\t')

      # A line without any tab carries no token/tag pair.
      if len(fields) == 1:
        continue

      token = fields[0].strip()
      tag = fields[1].strip()
      total_states.append(token + " " + tag if with_pos_tag else token)

  train_size = (len(total_states) * 80)//100

  train_set = total_states[:train_size]
  test_set = total_states[train_size:]

  def embed(entries):
    # dic['UNK'] is only evaluated for entries that are missing from dic.
    return np.array([
        embeddings[dic[entry] if entry in dic else dic['UNK']]
        for entry in entries
    ])

  X_train = embed(train_set)
  X_test = embed(test_set)
  return train_set, test_set, X_train, X_test

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def _agreement_percentage(predictions, tokens, tags):
  """Return the better of (agreement, 100 - agreement) between predictions and tags."""
  if len(predictions) != len(tokens):
    raise ValueError(
        "got %d predictions for %d tokens" % (len(predictions), len(tokens)))
  matches = sum(
      1 for label, token in zip(predictions, tokens) if label == tags[token])
  agreement = matches / len(tokens) * 100
  # Presumably the 0/1 ids produced by the unsupervised models are arbitrary,
  # so the score is taken for whichever of the two labelings agrees more.
  return max(100 - agreement, agreement)


def measure_accuracy(train_set, test_set, Y_train, Y_test, with_pos_tag = True):
  """Score predicted 0/1 labels against the chunk-tagged reference file.

  The reference is read from 'hin-token-chunk-conll-treebank.txt' (relative
  to the working directory) as UTF-8. Every line with at least one tab maps
  its first field (the token) to 0 when the second field starts with 'B' and
  to 1 otherwise. A token that appears on several lines keeps the label of
  its last line.

  Args:
    train_set: entries of the training split.
    test_set: entries of the test split.
    Y_train: predicted label for each training entry.
    Y_test: predicted label for each test entry.
    with_pos_tag: when true, only the first whitespace-separated piece of
      each entry is used as the token. The caller's lists are left
      unmodified.

  Returns:
    ``(train_accuracy, test_accuracy)`` as percentages; each one is
    ``max(agreement, 100 - agreement)``. Both values are also printed.

  Raises:
    ValueError: a prediction sequence and its entry list differ in length.
    KeyError: an entry's token does not occur in the reference file.
    ZeroDivisionError: a split is empty.
    IndexError: ``with_pos_tag`` is true and an entry is empty or whitespace.
  """
  tags = {}
  # Same explicit encoding as the corpus reader; the context manager also
  # guarantees the handle is closed.
  with open('hin-token-chunk-conll-treebank.txt', 'r', encoding = 'utf-8') as chunk_file:
    for line in chunk_file:
      fields = line.split('\t')
      if len(fields) == 1:
        continue
      token = fields[0].strip()
      chunk_tag = fields[1].strip()
      # startswith tolerates an empty tag, which indexing [0] would not.
      tags[token] = 0 if chunk_tag.startswith('B') else 1

  if with_pos_tag:
    # Build new lists instead of overwriting the caller's entries.
    train_tokens = [entry.split()[0] for entry in train_set]
    test_tokens = [entry.split()[0] for entry in test_set]
  else:
    train_tokens = train_set
    test_tokens = test_set

  train_accuracy = _agreement_percentage(Y_train, train_tokens, tags)
  print("Training Accuracy is : ", train_accuracy)

  test_accuracy = _agreement_percentage(Y_test, test_tokens, tags)
  print("Testing Accuracy is : ", test_accuracy)
  return train_accuracy, test_accuracy


In [None]:
# Build the splits from bare tokens (no POS tag appended to the entries).
(train_set, test_set, X_train, X_test) = preprocess_data(with_pos_tag=False)

In [None]:
from sklearn.cluster import KMeans
# `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0, so it is
# no longer passed. The fixed seed makes the clustering (and the model that
# is pickled later) reproducible from one run to the next.
modelkmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=0)
modelkmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [None]:
# Cluster id of every training and test vector under the fitted K-Means model.
Y_train, Y_test = modelkmeans.predict(X_train), modelkmeans.predict(X_test)

In [None]:
# Score the K-Means cluster ids; the returned pair is the cell's output.
measure_accuracy(
    train_set=train_set,
    test_set=test_set,
    Y_train=Y_train,
    Y_test=Y_test,
    with_pos_tag=True,
)

Training Accuracy is :  50.29920308409332
Testing Accuracy is :  51.145020598863034


(50.29920308409332, 51.145020598863034)

In [None]:
!pip install hmmlearn

Collecting hmmlearn
[?25l  Downloading https://files.pythonhosted.org/packages/4b/98/a2829aeb942b7146034d497afb3fc738a78a4fbd4797a039c19a94bb31f7/hmmlearn-0.2.5-cp37-cp37m-manylinux1_x86_64.whl (369kB)
[K     |▉                               | 10kB 13.3MB/s eta 0:00:01[K     |█▊                              | 20kB 12.7MB/s eta 0:00:01[K     |██▋                             | 30kB 8.5MB/s eta 0:00:01[K     |███▌                            | 40kB 7.4MB/s eta 0:00:01[K     |████▍                           | 51kB 4.2MB/s eta 0:00:01[K     |█████▎                          | 61kB 4.7MB/s eta 0:00:01[K     |██████▏                         | 71kB 4.9MB/s eta 0:00:01[K     |███████                         | 81kB 5.1MB/s eta 0:00:01[K     |████████                        | 92kB 5.2MB/s eta 0:00:01[K     |████████▉                       | 102kB 5.5MB/s eta 0:00:01[K     |█████████▊                      | 112kB 5.5MB/s eta 0:00:01[K     |██████████▋                     | 1

In [None]:
from hmmlearn import hmm

# Two hidden states, matching the two labels scored by measure_accuracy.
# The fixed seed makes the fit (and the pickled model) reproducible.
model = hmm.GaussianHMM(n_components=2, random_state=0)
model.fit(X_train)

GaussianHMM(algorithm='viterbi', covariance_type='diag', covars_prior=0.01,
            covars_weight=1, init_params='stmc', means_prior=0, means_weight=0,
            min_covar=0.001, n_components=2, n_iter=10, params='stmc',
            random_state=None, startprob_prior=1.0, tol=0.01,
            transmat_prior=1.0, verbose=False)

In [None]:
# Hidden-state id of every training and test vector under the fitted HMM.
Y_train, Y_test = model.predict(X_train), model.predict(X_test)

In [None]:
# Score the HMM state ids; the returned pair is the cell's output.
measure_accuracy(
    train_set=train_set,
    test_set=test_set,
    Y_train=Y_train,
    Y_test=Y_test,
    with_pos_tag=True,
)

Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034


(50.29920308409333, 51.145020598863034)

In [None]:
import pickle
# Context managers close (and therefore flush) each file as soon as the dump
# finishes; the bare open() calls never closed their handles.
with open("model_embedding_without_pos_kmeans.pkl", "wb") as kmeans_file:
  pickle.dump(modelkmeans, kmeans_file)
with open("model_embedding_without_pos_hmm.pkl", "wb") as hmm_file:
  pickle.dump(model, hmm_file)

In [None]:
from sklearn.decomposition import PCA

train_accuracy = []
test_accuracy = []
# PCA cannot extract more components than min(n_samples, n_features) and
# raises ValueError when asked to. NOTE(review): that is presumably the
# ValueError recorded in this cell's output, so the sweep is capped here.
max_components = min(128, *X_train.shape)
for components in range(1, max_components + 1):
  pca = PCA(n_components=components)
  pca.fit(X_train)
  reduced_train = pca.transform(X_train)
  reduced_test = pca.transform(X_test)

  # A loop-local model keeps the sweep from overwriting `modelkmeans`.
  # `n_jobs` is omitted (removed in scikit-learn 1.0) and the seed is fixed
  # so the accuracy curve is reproducible.
  reduced_kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=0)
  reduced_kmeans.fit(reduced_train)
  train_results = reduced_kmeans.predict(reduced_train)
  test_results = reduced_kmeans.predict(reduced_test)

  train_acc, test_acc = measure_accuracy(train_set, test_set, train_results, test_results)
  train_accuracy.append(train_acc)
  test_accuracy.append(test_acc)
  if(components % 50 == 0):
    print(components)

Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409332
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409332
Testing Accuracy is :  51.14502059

ValueError: ignored

In [None]:
import matplotlib.pyplot as plt

# One x position per recorded accuracy, so the plot still works when the
# sweep stopped before reaching its last component count.
x = list(range(1, len(train_accuracy) + 1))
y = train_accuracy
y2 = test_accuracy
plt.plot(x, y, label = "train-accuracy")
plt.plot(x, y2, label = "test-accuracy")

plt.xlabel('Number of Components')
plt.ylabel('Accuracy')
plt.title('PCA Prediction!')

plt.legend()
# Save before show(): once show() has displayed the figure, a later savefig
# writes a new, empty figure instead of this plot.
plt.savefig('Pretrained_embedding_kmean_without_pos.png')
plt.show()

In [None]:

train_accuracy = []
test_accuracy = []
# PCA cannot extract more components than min(n_samples, n_features) and
# raises ValueError when asked to. NOTE(review): that is presumably the
# ValueError recorded in this cell's output, so the sweep is capped here.
max_components = min(199, *X_train.shape)
for components in range(1, max_components + 1):
  pca = PCA(n_components=components)
  pca.fit(X_train)
  reduced_train = pca.transform(X_train)
  reduced_test = pca.transform(X_test)

  # A loop-local, seeded HMM: the sweep no longer stores an HMM under the
  # name `modelkmeans`, and the accuracy curve is reproducible.
  reduced_hmm = hmm.GaussianHMM(n_components=2, random_state=0)
  reduced_hmm.fit(reduced_train)
  train_results = reduced_hmm.predict(reduced_train)
  test_results = reduced_hmm.predict(reduced_test)

  train_acc, test_acc = measure_accuracy(train_set, test_set, train_results, test_results)
  train_accuracy.append(train_acc)
  test_accuracy.append(test_acc)
  if(components % 50 == 0):
    print(components)

Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409332
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.14502059

ValueError: ignored

In [None]:
import matplotlib.pyplot as plt

# One x position per recorded accuracy, so the plot still works when the
# sweep stopped before reaching its last component count.
x = list(range(1, len(train_accuracy) + 1))
y = train_accuracy
y2 = test_accuracy
plt.plot(x, y, label = "train-accuracy")
plt.plot(x, y2, label = "test-accuracy")

plt.xlabel('Number of Components')
plt.ylabel('Accuracy')
plt.title('PCA Prediction!')

plt.legend()
# Save before show(): once show() has displayed the figure, a later savefig
# writes a new, empty figure instead of this plot.
plt.savefig('Pretrained_embedding_hmm_without_pos.png')
plt.show()

In [None]:
# Rebuild the splits with the POS tag appended to every entry.
(train_set, test_set, X_train, X_test) = preprocess_data(with_pos_tag=True)

In [None]:
# `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0, so it is
# no longer passed. The fixed seed makes the clustering (and the model that
# is pickled later) reproducible from one run to the next.
modelkmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=0)
modelkmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [None]:
# Cluster id of every training and test vector under the fitted K-Means model.
Y_train, Y_test = modelkmeans.predict(X_train), modelkmeans.predict(X_test)

In [None]:
# Score the K-Means cluster ids; the returned pair is the cell's output.
measure_accuracy(
    train_set=train_set,
    test_set=test_set,
    Y_train=Y_train,
    Y_test=Y_test,
    with_pos_tag=True,
)

Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034


(50.29920308409333, 51.145020598863034)

In [None]:
# Two hidden states, matching the two labels scored by measure_accuracy.
# The fixed seed makes the fit (and the pickled model) reproducible.
model = hmm.GaussianHMM(n_components=2, random_state=0)
model.fit(X_train)

GaussianHMM(algorithm='viterbi', covariance_type='diag', covars_prior=0.01,
            covars_weight=1, init_params='stmc', means_prior=0, means_weight=0,
            min_covar=0.001, n_components=2, n_iter=10, params='stmc',
            random_state=None, startprob_prior=1.0, tol=0.01,
            transmat_prior=1.0, verbose=False)

In [None]:
# Hidden-state id of every training and test vector under the fitted HMM.
Y_train, Y_test = model.predict(X_train), model.predict(X_test)

In [None]:
# Score the HMM state ids; the returned pair is the cell's output.
measure_accuracy(
    train_set=train_set,
    test_set=test_set,
    Y_train=Y_train,
    Y_test=Y_test,
    with_pos_tag=True,
)

Training Accuracy is :  50.29920308409332
Testing Accuracy is :  51.145020598863034


(50.29920308409332, 51.145020598863034)

In [None]:
import pickle
# Context managers close (and therefore flush) each file as soon as the dump
# finishes; the bare open() calls never closed their handles.
with open("model_embedding_with_pos_kmeans.pkl", "wb") as kmeans_file:
  pickle.dump(modelkmeans, kmeans_file)
with open("model_embedding_with_pos_hmm.pkl", "wb") as hmm_file:
  pickle.dump(model, hmm_file)

In [None]:
from sklearn.decomposition import PCA

train_accuracy = []
test_accuracy = []
# PCA cannot extract more components than min(n_samples, n_features) and
# raises ValueError when asked to, so the sweep is capped at what X_train
# supports.
max_components = min(199, *X_train.shape)
for components in range(1, max_components + 1):
  pca = PCA(n_components=components)
  pca.fit(X_train)
  reduced_train = pca.transform(X_train)
  reduced_test = pca.transform(X_test)

  # A loop-local model keeps the sweep from overwriting `modelkmeans`.
  # `n_jobs` is omitted (removed in scikit-learn 1.0) and the seed is fixed
  # so the accuracy curve is reproducible.
  reduced_kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=0)
  reduced_kmeans.fit(reduced_train)
  train_results = reduced_kmeans.predict(reduced_train)
  test_results = reduced_kmeans.predict(reduced_test)

  train_acc, test_acc = measure_accuracy(train_set, test_set, train_results, test_results)
  train_accuracy.append(train_acc)
  test_accuracy.append(test_acc)
  if(components % 50 == 0):
    print(components)

Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409332
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409332
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409332
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409332
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409332
Testing Accuracy is :  51.145020598863034
Training Accuracy is :  50.29920308409333
Testing Accuracy is :  51.14502059

KeyboardInterrupt: ignored

In [None]:
import matplotlib.pyplot as plt

# One x position per recorded accuracy, so the plot still works when the
# sweep stopped before reaching its last component count.
x = list(range(1, len(train_accuracy) + 1))
y = train_accuracy
y2 = test_accuracy
plt.plot(x, y, label = "train-accuracy")
plt.plot(x, y2, label = "test-accuracy")

plt.xlabel('Number of Components')
plt.ylabel('Accuracy')
plt.title('PCA Prediction!')

plt.legend()
# Save before show(): once show() has displayed the figure, a later savefig
# writes a new, empty figure instead of this plot.
plt.savefig('Pretrained_embedding_kmean_with_pos.png')
plt.show()

In [None]:

train_accuracy = []
test_accuracy = []
# PCA cannot extract more components than min(n_samples, n_features) and
# raises ValueError when asked to, so the sweep is capped at what X_train
# supports.
max_components = min(199, *X_train.shape)
for components in range(1, max_components + 1):
  pca = PCA(n_components=components)
  pca.fit(X_train)
  reduced_train = pca.transform(X_train)
  reduced_test = pca.transform(X_test)

  # A loop-local, seeded HMM: the sweep no longer stores an HMM under the
  # name `modelkmeans`, and the accuracy curve is reproducible.
  reduced_hmm = hmm.GaussianHMM(n_components=2, random_state=0)
  reduced_hmm.fit(reduced_train)
  train_results = reduced_hmm.predict(reduced_train)
  test_results = reduced_hmm.predict(reduced_test)

  train_acc, test_acc = measure_accuracy(train_set, test_set, train_results, test_results)
  train_accuracy.append(train_acc)
  test_accuracy.append(test_acc)
  if(components % 50 == 0):
    print(components)

In [None]:
import matplotlib.pyplot as plt

# One x position per recorded accuracy, so the plot still works when the
# sweep stopped before reaching its last component count.
x = list(range(1, len(train_accuracy) + 1))
y = train_accuracy
y2 = test_accuracy
plt.plot(x, y, label = "train-accuracy")
plt.plot(x, y2, label = "test-accuracy")

plt.xlabel('Number of Components')
plt.ylabel('Accuracy')
plt.title('PCA Prediction!')

plt.legend()
# This is the with-POS HMM sweep: the previous file name ended in
# "_without_pos" and overwrote the figure saved for the without-POS run.
# Save before show(): once show() has displayed the figure, a later savefig
# writes a new, empty figure instead of this plot.
plt.savefig('Pretrained_embedding_hmm_with_pos.png')
plt.show()