In [1]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive/')

!ls '/content/drive/Shared drives/ADA_assignment_2/A2_data/'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
abstracts_Vocab.txt		  encoded_data.pickle
arxiv-metadata-oai-snapshot.json  hyperparameter_data
clean_balanced_train_data.csv	  models
clean_full_train_data.csv	  Predictions
clean_test_data.csv		  test_data.csv
encoded_data_450k.pickle	  train_data_labels.csv
encoded_data_500k.pickle	  word2vec
encoded_data_balanced.pickle


### Library load

In [None]:
# Loading necessary Libraries
import re
import nltk
import string
from tqdm import tqdm
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

### Data Load

In [69]:
train_df = pd.read_csv('/content/drive/Shared drives/ADA_assignment_2/A2_data/train_data_labels.csv', usecols = ['label', 'abstract'])
train_df.head()

Unnamed: 0,abstract,label
0,"save for some special cases, current training ...",cs
1,we consider a dynamical system with finitely m...,math.DS
2,"we consider discrete dynamical systems of ""ant...",cs
3,"retrofitting techniques, which inject external...",cs
4,approaches to decision-making under uncertaint...,cs


In [70]:
train_df.dropna(inplace = True) # Drop any null values in data
train_df = train_df.drop_duplicates(subset=['abstract'], keep = False) # Drop the duplicate abstracts with multiple labels per abstract
train_df = train_df.reset_index(drop = True) # Reset index for better access of rows
train_df.shape

(19746, 2)

In [71]:
# Check the data balance with labels
df1 = train_df['label'].value_counts().reset_index()
df1

Unnamed: 0,index,label
0,cs,8438
1,math.AG,818
2,math.CO,657
3,astro-ph.HE,623
4,math.AP,560
...,...,...
95,physics.pop-ph,3
96,nlin.CG,3
97,physics.atm-clus,2
98,stat.OT,2


In [14]:
len(set(train_df['label']))

100

### Text Pre-processing function

In [15]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

def preprocessing(all_abstracts):

    """
    Take in an array of abstracts, and return the processed abstracts by performing a series of steps
    """

    processed_abstracts = []

    for abstract in tqdm(all_abstracts):

        # remove other non-alphabets tokens (i.e. keep only alphabets and whitespaces).
        abstract = re.sub('[^a-zA-Z ]', '', abstract)

        # convert to lowercase
        abstract_lower = abstract.lower()

        # remove URLs
        abstract_no_url = re.compile(r'https?://\S+|www\.\S+').sub(r'', abstract_lower)

        # remove HTML tags
        abstract_no_html = re.compile(r'<[^>]*>').sub(r'', abstract_no_url)

        # replace the digits with space
        abstract_no_digit = re.sub(r'[^\D\s]', ' ', abstract_no_html)

        # remove punctuation
        # these are the punctuations supplied by python by default - !"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~`
        
        abstract_no_punctuations = ' '.join(re.sub(r'[^\w\s]', ' ', abstract_no_digit).split())
        abstract_no_punctuations = abstract_no_punctuations.replace('_', ' ')

        # Stop Words Removal
        STOPWORDS = set(stopwords.words('english'))
        abstract_no_stopwords = " ".join([word for word in str(abstract_no_punctuations).split() if word not in STOPWORDS])

        # Stemming
        # stemmer = PorterStemmer()
        # abstract_stemmed = " ".join([stemmer.stem(word) for word in abstract_no_stopwords.split()])

        # Lemmatizer with POS tagging
        lemmatizer = WordNetLemmatizer()
        wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
        pos_tagged_text = nltk.pos_tag(abstract_no_stopwords.split())
        abstract_lemmatized = " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

        processed = abstract_lemmatized

        words = processed.split()

        # keep words that have length of more than 1 (e.g. gb, bb), remove those with length 1.
        processed_abstracts.append(' '.join([word for word in words if len(word) > 1]))

    return processed_abstracts # return list of processed abstracts

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [72]:
train_df['processed_abstract'] = preprocessing(train_df['abstract']) # Pre-process raw abstracts

100%|██████████| 19746/19746 [02:12<00:00, 149.08it/s]


In [17]:
# Generation of inner and outer labels for Hierarchical modelling
label_outer = []
label_inner = []

for ix, row in train_df.iterrows():
    label = row['label']
    label_splits = label.split('.') # Split actual label into 2 parts
    
    if len(label_splits) == 2:
        label_outer.append(label_splits[0]) # Append the high-level label into label_outer
        label_inner.append(label_splits[1]) # Append the sub-level label into label_inner
    else: # Handle 'cs' seperately
        label_outer.append(label_splits[0]) # Append the high-level label into label_outer
        label_inner.append('Nan') # Since no sub-levels for 'cs'
        
train_df['label_outer'] = label_outer
train_df['label_inner'] = label_inner

train_df

Unnamed: 0,abstract,label,processed_abstract,label_outer,label_inner
0,"save for some special cases, current training ...",cs,save special case current training method gene...,cs,Nan
1,we consider a dynamical system with finitely m...,math.DS,consider dynamical system finitely many equili...,math,DS
2,"we consider discrete dynamical systems of ""ant...",cs,consider discrete dynamical system ant like ag...,cs,Nan
3,"retrofitting techniques, which inject external...",cs,retrofit technique inject external resource wo...,cs,Nan
4,approaches to decision-making under uncertaint...,cs,approach decision make uncertainty belief func...,cs,Nan
...,...,...,...,...,...
19741,"with the powerful deep network architectures, ...",cs,powerful deep network architecture generative ...,cs,Nan
19742,we develop a mixed-characteristic version of t...,math.AG,develop mixed characteristic version mori muka...,math,AG
19743,"in complex analysis, the winding number measur...",cs,complex analysis wind number measure number ti...,cs,Nan
19744,we discuss secure computation of modular sum w...,cs,discuss secure computation modular sum multipl...,cs,Nan


In [18]:
# Generate indexing labels for Level-1 classifier
train_df1 = train_df[['label_outer', 'processed_abstract']]
train_df1.loc[:,('label_id')] = train_df1['label_outer'].factorize()[0] # Factorise to get label-encodings of outer-label
label_id_df = train_df1[['label_outer', 'label_id']].drop_duplicates().sort_values('label_id')
label_to_id = dict(label_id_df.values) # Generate a look-up dictionary to get the labels from predictions
id_to_label_outer = dict(label_id_df[['label_id', 'label_outer']].values)
id_to_label_outer

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


{0: 'cs',
 1: 'math',
 2: 'q-fin',
 3: 'cond-mat',
 4: 'astro-ph',
 5: 'q-bio',
 6: 'physics',
 7: 'stat',
 8: 'nlin'}

In [19]:
train_df1 # To be used for Level-1 model

Unnamed: 0,label_outer,processed_abstract,label_id
0,cs,save special case current training method gene...,0
1,math,consider dynamical system finitely many equili...,1
2,cs,consider discrete dynamical system ant like ag...,0
3,cs,retrofit technique inject external resource wo...,0
4,cs,approach decision make uncertainty belief func...,0
...,...,...,...
19741,cs,powerful deep network architecture generative ...,0
19742,math,develop mixed characteristic version mori muka...,1
19743,cs,complex analysis wind number measure number ti...,0
19744,cs,discuss secure computation modular sum multipl...,0


In [20]:
# Generate index labels for Level-2 models
# Also generate multiple dataframes for every class
id_to_label_inner = {}
train_df2 = {}
for label_code, label_outer in id_to_label_outer.items():
    if label_outer == 'cs':
        continue
#     print(label_outer)
    
    temp_df = train_df[train_df['label_outer'] == label_outer].reset_index(drop = True)
    
    temp_df = temp_df[['label_inner', 'processed_abstract']]
    temp_df.loc[:,('label_id')] = temp_df['label_inner'].factorize()[0] # Factorise to get label-encodings of inner-label
    label_id_df = temp_df[['label_inner', 'label_id']].drop_duplicates().sort_values('label_id')
    label_to_id = dict(label_id_df.values) # Generate a look-up dictionary to get the labels from predictions
    id_to_label = dict(label_id_df[['label_id', 'label_inner']].values)
    id_to_label_inner[label_outer] = id_to_label
    train_df2[label_outer] = temp_df


In [21]:
id_to_label_inner['math']

{0: 'DS',
 1: 'LO',
 2: 'CO',
 3: 'OA',
 4: 'NT',
 5: 'FA',
 6: 'AP',
 7: 'RT',
 8: 'AG',
 9: 'AC',
 10: 'IT',
 11: 'CA',
 12: 'GT',
 13: 'SG',
 14: 'DG',
 15: 'AT',
 16: 'GR',
 17: 'CT',
 18: 'RA',
 19: 'KT',
 20: 'OC',
 21: 'MG',
 22: 'HO',
 23: 'PR',
 24: 'CV',
 25: 'MP',
 26: 'NA',
 27: 'QA',
 28: 'GN',
 29: 'GM',
 30: 'SP',
 31: 'ST'}

In [22]:
train_df2['math'] # To be used by Level-2 models based on the label from Level-1

Unnamed: 0,label_inner,processed_abstract,label_id
0,DS,consider dynamical system finitely many equili...,0
1,LO,investigate correspondence complexity hierarch...,1
2,CO,let ge fix constant let mathcal uniform regula...,2
3,OA,show large class countable discrete group sati...,3
4,CO,generalize two main theorem match polynomial u...,2
...,...,...,...
4643,DG,aim paper investigate uniqueness conic constan...,14
4644,AP,show knowledge dirichlet neumann map rough del...,6
4645,DG,paper prove compact manifold einstein metric p...,14
4646,AG,develop mixed characteristic version mori muka...,8


In [23]:
del train_df # Free-up memory

In [26]:
#Generate TFIDF with best custom parameters based on multiple testing
tfidf = TfidfVectorizer(encoding='utf-8',
                        stop_words=None,
                        lowercase=False,
                        max_df=0.3,
                        min_df=10,
                        max_features=20000, 
                        norm='l2',
                        sublinear_tf=True)

tfidf_X = tfidf.fit_transform(train_df1['processed_abstract']).toarray() # Fit and transform the processed abstract

In [27]:
# Perform a 80-20 split on train-data
train_x, test_x, train_y, test_y = train_test_split(tfidf_X, 
                                                   train_df1['label_id'],
                                                   test_size=0.2,
                                                   random_state=11)

In [28]:
# Train the SVM model on the train and test split
np.random.seed(42)
SVM = svm.SVC(kernel='linear', verbose=True)
SVM.fit(train_x,train_y)

[LibSVM]

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=True)

In [29]:
# Testing the model on the test set
predictions_SVM = SVM.predict(test_x)
print("SVM Accuracy Score (Validation) -> ",accuracy_score(predictions_SVM, test_y)*100)

SVM Accuracy Score (Validation) ->  94.32911392405063


In [33]:
# Storing the models and tfidf for future usage in a pickle file.
import pickle
multi_model = {}

tfidfs = {}
models = {}

tfidfs['outer_model'] = tfidf # Level-1 tfidf for outer classes
models['outer_model'] = SVM # Level-1 model for outer classes

multi_model['tfidfs'] = tfidfs
multi_model['models'] = models

# Update the model files into a file for future usage
with open('/content/drive/Shared drives/ADA_assignment_2/A2_data/models/SVM_multimodel_4.pkl','wb') as f:
    pickle.dump(multi_model,f)
  

In [54]:
# Training the data points for every label building multiple Level-2 classifiers

def train_SVM(temp_df, min_df, max_df, max_features, n_gram):
  tfidf = TfidfVectorizer(encoding='utf-8',
                          ngram_range= n_gram,
                          stop_words=None,
                          lowercase=False,
                          max_df= max_df,
                          min_df=min_df,
                          max_features=max_features, 
                          norm='l2',
                          sublinear_tf=True)

  # Generate tfidf on the dataframe passed as input
  tfidf_X = tfidf.fit_transform(temp_df['processed_abstract']).toarray()

  train_x, test_x, train_y, test_y = train_test_split(tfidf_X, 
                                                    temp_df['label_id'],
                                                    test_size=0.2,
                                                    random_state=11)

  np.random.seed(42)
  SVM = svm.SVC(kernel='linear', verbose=False)
  SVM.fit(train_x,train_y) # Fit the sub-label data with SVM

  # The performance is tested on the test set
  predictions_SVM = SVM.predict(test_x)
  print("SVM Accuracy Score (Validation) -> ",accuracy_score(predictions_SVM, test_y)*100)

  return(tfidf, SVM)

In [55]:
# The search for optimised parameters is performed in the final section of this notebook.
# These parameters are optimised based on the data from every label for the best performance

optimised_parameters = {
    'math' : {'min_df' : 3, 'max_df' : 0.1, 'max_features' : 10000, 'n_gram' : (1,1)},
    'q-fin' : {'min_df' : 15, 'max_df' : 0.5, 'max_features' :3000, 'n_gram' : (1,1)},
    'cond-mat' : {'min_df' : 3, 'max_df' : 0.1, 'max_features' :3000, 'n_gram' : (1,1)},
    'astro-ph' : {'min_df' : 3, 'max_df' : 0.5, 'max_features' :5000, 'n_gram' : (1,1)},
    'q-bio' : {'min_df' : 10, 'max_df' : 0.4, 'max_features' :1000, 'n_gram' : (1,1)},
    'physics' : {'min_df' : 3, 'max_df' : 0.3, 'max_features' :3000, 'n_gram' : (1,1)},
    'stat' : {'min_df' : 7, 'max_df' : 0.2, 'max_features' :1000, 'n_gram' : (1,1)},
    'nlin' : {'min_df' : 5, 'max_df' : 0.5, 'max_features' :1000, 'n_gram' : (1,1)}
}
optimised_parameters

{'astro-ph': {'max_df': 0.5,
  'max_features': 5000,
  'min_df': 3,
  'n_gram': (1, 1)},
 'cond-mat': {'max_df': 0.1,
  'max_features': 3000,
  'min_df': 3,
  'n_gram': (1, 1)},
 'math': {'max_df': 0.1, 'max_features': 10000, 'min_df': 3, 'n_gram': (1, 1)},
 'nlin': {'max_df': 0.5, 'max_features': 1000, 'min_df': 5, 'n_gram': (1, 1)},
 'physics': {'max_df': 0.3,
  'max_features': 3000,
  'min_df': 3,
  'n_gram': (1, 1)},
 'q-bio': {'max_df': 0.4,
  'max_features': 1000,
  'min_df': 10,
  'n_gram': (1, 1)},
 'q-fin': {'max_df': 0.5,
  'max_features': 3000,
  'min_df': 15,
  'n_gram': (1, 1)},
 'stat': {'max_df': 0.2, 'max_features': 1000, 'min_df': 7, 'n_gram': (1, 1)}}

In [57]:
# Training Level-2 models
for label, label_df in train_df2.items():
  print(label, ' training...')

  min_df_ = optimised_parameters[label]['min_df']
  max_df_ = optimised_parameters[label]['max_df']
  max_feat_ = optimised_parameters[label]['max_features']
  n_gram_ = optimised_parameters[label]['n_gram']
  
  tfidf_inner, SVM_inner =  train_SVM(label_df, min_df_, max_df_, max_feat_, n_gram_) # Train the model with optimised parameters by calling the train_SVM() function

  tfidfs[label] = tfidf_inner
  models[label] = SVM_inner

  # Models and TFIDF are stored for future usage
  multi_model['tfidfs'] = tfidfs
  multi_model['models'] = models

# All the trained models and TFIDF are stored in a file for possible future usage
with open('/content/drive/Shared drives/ADA_assignment_2/A2_data/models/SVM_multimodel_4.pkl','wb') as f:
  pickle.dump(multi_model,f)

math  training...
SVM Accuracy Score (Validation) ->  63.54838709677419
q-fin  training...
SVM Accuracy Score (Validation) ->  57.009345794392516
cond-mat  training...
SVM Accuracy Score (Validation) ->  67.23404255319149
astro-ph  training...
SVM Accuracy Score (Validation) ->  80.6949806949807
q-bio  training...
SVM Accuracy Score (Validation) ->  65.97510373443983
physics  training...
SVM Accuracy Score (Validation) ->  65.28925619834712
stat  training...
SVM Accuracy Score (Validation) ->  53.96825396825397
nlin  training...
SVM Accuracy Score (Validation) ->  77.77777777777779


# Classification on Original test submission file for Kaggle

In [37]:
test_df = pd.read_csv('/content/drive/Shared drives/ADA_assignment_2/A2_data/test_data.csv')
test_df

Unnamed: 0,test_id,abstract
0,1,the method of model averaging has become an im...
1,2,unmanned aerial vehicle (uav) systems are bein...
2,3,"in this paper, we propose a new loss function ..."
3,4,we show how to integrate a weak morphism of li...
4,5,caustics occur widely in dynamics and take on ...
...,...,...
7405,7406,statistical inference of evolutionary paramete...
7406,7407,we present a deep learning framework based on ...
7407,7408,t-cell receptor (tcr) repertoire data contain ...
7408,7409,"in this paper, we provide a modern synthesis o..."


In [38]:
# Pre-process abstracts
test_df['processed_abstract'] = preprocessing(test_df['abstract'])

100%|██████████| 7410/7410 [00:51<00:00, 144.76it/s]


In [58]:
# Load Level-1 model and TFIDF
tfidf = multi_model['tfidfs']['outer_model']
SVM = multi_model['models']['outer_model']

# Transform the data with the tfidf of the Level-1 model
tfidf_test = tfidf.transform(test_df['processed_abstract']).toarray()

In [59]:
# Make predictions with the Level-1 model and get the high-level classes
outer_predictions = SVM.predict(tfidf_test)
test_df['label'] = [id_to_label_outer[prediction] for prediction in outer_predictions] # Generate actual labels from predictions in a human-readable format
test_df.head()

Unnamed: 0,test_id,abstract,processed_abstract,label
0,1,the method of model averaging has become an im...,method model average become important tool dea...,stat
1,2,unmanned aerial vehicle (uav) systems are bein...,unmanned aerial vehicle uav system increasingl...,cs
2,3,"in this paper, we propose a new loss function ...",paper propose new loss function call generaliz...,cs
3,4,we show how to integrate a weak morphism of li...,show integrate weak morphism lie algebra cross...,math
4,5,caustics occur widely in dynamics and take on ...,caustic occur widely dynamic take shape classi...,cond-mat


In [60]:
final_predictions = []

# Use appropriate Level-2 models based on the predictions from the Level-1 model
for ix, prediction in tqdm(enumerate(outer_predictions)):

  # Skip if the Level-1 prediction is 'cs' since there are no further sub-classification
  if id_to_label_outer[prediction] == 'cs':
    final_predictions.append('cs')
    # print('cs')
    continue
    
  # Load Level-2 classifier model based on the Level-1 prediction
  outer_lab = id_to_label_outer[prediction]
  tfidf = multi_model['tfidfs'][outer_lab]
  SVM = multi_model['models'][outer_lab]

  # Predict the sub-class from the loaded TFIDF and the model
  tfidf_inner = tfidf.transform([test_df.iloc[ix]['processed_abstract']]).toarray()
  inner_prediction = SVM.predict(tfidf_inner)
  inner_lab = id_to_label_inner[outer_lab][inner_prediction[0]] # Generate subclass label by indexing back into a label from a Label_ID

  # print(outer_lab + '.' + inner_lab)
  final_predictions.append(outer_lab + '.' + inner_lab) # Append the final predictions into a single prediction
  

7410it [01:16, 97.46it/s]


In [61]:
# Load the predictions onto the test set
test_df['label'] = final_predictions
test_df.head()

Unnamed: 0,test_id,abstract,processed_abstract,label
0,1,the method of model averaging has become an im...,method model average become important tool dea...,stat.ME
1,2,unmanned aerial vehicle (uav) systems are bein...,unmanned aerial vehicle uav system increasingl...,cs
2,3,"in this paper, we propose a new loss function ...",paper propose new loss function call generaliz...,cs
3,4,we show how to integrate a weak morphism of li...,show integrate weak morphism lie algebra cross...,math.AT
4,5,caustics occur widely in dynamics and take on ...,caustic occur widely dynamic take shape classi...,cond-mat.quant-gas


In [62]:
# Save the file to upload and check the model performance
test_df[['test_id', 'label']].to_csv('/content/drive/Shared drives/ADA_assignment_2/A2_data/Predictions/svc_multi_predictions_v5.csv', index = False)

---

# Supplementary Code
```This is the end of the main content of the notebook. The subsequent sections contain the supporting code pertaining to other supplementary tasks```

### Data augmentation attempt


In [None]:
# train_df_aug = pd.read_csv('/content/drive/Shared drives/ADA_assignment_2/A2_data/clean_balanced_train_data.csv', usecols = ['label', 'abstract']) # If augmenting data for balanced labels
shortlisted_df = pd.DataFrame(columns = ['label', 'abstract'])
limit_per_class = 250 # Decide upon a count for all the calsses

for ix, row in df1.iterrows():

  label = row['index']
  count = row['label']

  if count > limit_per_class: # Sample randomly when there are more observations --> UNDER_SAMPLING
    temp_df = train_df[train_df['label'] == label]
    temp_df = temp_df.sample(n = limit_per_class)

    shortlisted_df = pd.concat([shortlisted_df, temp_df])
  else: # Borrow data from the augmented data if the samples are less in number --> OVER_SAMPLING
    temp_df = train_df[train_df['label'] == label]
    shortlisted_df = pd.concat([shortlisted_df, temp_df])

    temp_df = train_df_aug[train_df_aug['label'] == label]
    temp_df = temp_df.sample(n = (limit_per_class - count))
    shortlisted_df = pd.concat([shortlisted_df, temp_df])

train_df = shortlisted_df.sample(frac=1).reset_index(drop=True) # Shuffle data completely
print(train_df.shape[0])
train_df.head()

### Testing for optimum parameters for all the Level 2 classifiers

In [None]:
#TFIDF
def train_SVM(temp_df, min_df, max_df, max_features, n_grams):
  tfidf = TfidfVectorizer(encoding='utf-8',
                          ngram_range=n_grams,
                          stop_words=None,
                          lowercase=False,
                          max_df=max_df,
                          min_df=min_df,
                          max_features=max_features, 
                          norm='l2',
                          sublinear_tf=True)

  tfidf_X = tfidf.fit_transform(temp_df['processed_abstract']).toarray()

  train_x, test_x, train_y, test_y = train_test_split(tfidf_X, 
                                                    temp_df['label_id'],
                                                    test_size=0.2,
                                                    random_state=11)

  np.random.seed(42)
  SVM = svm.SVC(kernel='linear', verbose=False)
  SVM.fit(train_x,train_y)

  predictions_SVM = SVM.predict(test_x)
  accuracy = accuracy_score(predictions_SVM, test_y)*100
  # print("SVM Accuracy Score (Validation) -> ", accuracy)

  return accuracy

In [None]:
outer_label = 'stat'
temp_df = train_df2[outer_label]

In [None]:
train_SVM(temp_df, 10, 0.3, 3000, (1,1))

48.41269841269841

In [None]:
acc_df = pd.DataFrame(columns = ['min_df', 'max_df', 'max_features', 'n_gram', 'accuracy'])
df_ix = 0

min_dfs = [1, 2,5,8,10,12,15,17,20, 25]
max_dfs = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
max_features = [1000, 3000, 5000, 10000, 15000, 20000, 25000, 30000]
n_grams = [(1,1), (1,2), (1,3)]

for min_ in tqdm(min_dfs):
  for max_ in max_dfs:
    for max_feat_ in max_features:
      for gram in n_grams:
        try:
          acc = train_SVM(temp_df, min_, max_, max_feat_, gram)
          acc_df.loc[df_ix] = [min_, max_, max_feat_, gram, acc]
        except:
          acc_df.loc[df_ix] = [min_, max_, max_feat_, gram, 0.0]
        df_ix += 1



  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [22:48<3:25:14, 1368.23s/it][A
 20%|██        | 2/10 [34:08<2:34:54, 1161.83s/it][A
 30%|███       | 3/10 [38:18<1:43:38, 888.29s/it] [A
 40%|████      | 4/10 [41:15<1:07:28, 674.83s/it][A
 50%|█████     | 5/10 [43:48<43:12, 518.45s/it]  [A
 60%|██████    | 6/10 [46:02<26:52, 403.04s/it][A
 70%|███████   | 7/10 [47:56<15:48, 316.22s/it][A
 80%|████████  | 8/10 [49:41<08:25, 252.85s/it][A
 90%|█████████ | 9/10 [51:14<03:24, 204.88s/it][A
100%|██████████| 10/10 [52:33<00:00, 315.37s/it]


In [None]:
acc_df = acc_df.sort_values(['accuracy'], ascending = False).reset_index()
max_val = max(acc_df['accuracy'])
acc_df = acc_df[acc_df['accuracy'] == max_val].sort_values(['max_features', 'min_df', 'max_df', 'n_gram'], ascending = [True, True, True, True])
acc_df.to_csv('/content/drive/Shared drives/ADA_assignment_2/A2_data/hyperparameter_data/' + outer_label + '_inner.csv', index = False)
acc_df

Unnamed: 0,index,min_df,max_df,max_features,n_gram,accuracy
2,364,5,0.7,3000,"(1, 2)",54.761905
9,388,5,0.9,3000,"(1, 2)",54.761905
20,412,5,1.0,3000,"(1, 2)",54.761905
3,367,5,0.7,5000,"(1, 2)",54.761905
10,391,5,0.9,5000,"(1, 2)",54.761905
16,415,5,1.0,5000,"(1, 2)",54.761905
4,370,5,0.7,10000,"(1, 2)",54.761905
11,394,5,0.9,10000,"(1, 2)",54.761905
18,418,5,1.0,10000,"(1, 2)",54.761905
5,373,5,0.7,15000,"(1, 2)",54.761905
