The code below was used to examine how different Doc2Vec hyperparameter values (vector size, window, minimum count, epochs) affect the accuracy of the prediction.
```python
from sklearn.metrics import accuracy_score
import itertools
import numpy as np

def grid_search_doc2vec_parameters(tagged_data, df_train, df_test, y_test, param_grid):
    """Grid-search Doc2Vec hyper-parameters, scoring each by classifier accuracy.

    For every combination in ``param_grid`` a Doc2Vec model is trained on
    ``tagged_data``, vectors are inferred for the training and test
    documents, a LogisticRegression is fitted on the training vectors and
    its accuracy on the test vectors is recorded.

    Args:
        tagged_data: Tagged training documents; passed to
            ``train_doc2vec_model`` and ``infer_vectors``.
        df_train: Training DataFrame; its 'label' column supplies the
            training labels.
        df_test: Test DataFrame; converted with ``create_tagged_document``.
        y_test: True labels for the test documents.
        param_grid: Mapping whose values are iterables of candidate values.
            The values are unpacked positionally, so the entries must be
            ordered vector_size, window, min_count, epochs.

    Returns:
        A ``(best_params, all_results)`` tuple. ``best_params`` is a dict of
        the combination with the highest accuracy (empty if no combination
        scored above 0). ``all_results`` is a list of
        ``(accuracy, vector_size, window, min_count, epochs)`` tuples in
        evaluation order.
    """
    best_accuracy = 0
    best_params = {}
    all_results = []

    combinations = list(itertools.product(*param_grid.values()))

    if combinations:
        # Tokenising the test frame does not depend on the hyper-parameters,
        # so do it once here instead of once per combination.
        tagged_data_test = create_tagged_document(df_test)
        y_train = df_train['label']

    for vector_size, window, min_count, epochs in combinations:
        print(f"Training Doc2Vec with vector_size={vector_size}, window={window}, min_count={min_count}, epochs={epochs}...")

        # Train Doc2Vec model
        model = train_doc2vec_model(tagged_data, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, workers=4)

        # Infer vectors for training and testing datasets with the same model
        X_train = infer_vectors(model, tagged_data)
        X_test = infer_vectors(model, tagged_data_test)

        # Train Logistic Regression model
        clf = LogisticRegression(random_state=0, max_iter=1000)
        clf.fit(X_train, y_train)

        # Predict and evaluate
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"Accuracy: {accuracy}")

        # Keep track of the best parameters
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {'vector_size': vector_size, 'window': window, 'min_count': min_count, 'epochs': epochs}

        # Store all results
        all_results.append((accuracy, vector_size, window, min_count, epochs))

    print(f"Best Accuracy: {best_accuracy}")
    print(f"Best Parameters: {best_params}")
    return best_params, all_results

# Define your parameter grid
# NOTE: grid_search_doc2vec_parameters unpacks param_grid.values() positionally,
# so the keys must stay in the order vector_size, window, min_count, epochs.
param_grid = {
    'vector_size': [768, 400, 500, 300],  # Four vector sizes to compare
    'window': [2],           # Single window size (held fixed)
    'min_count': [2],        # Single min_count value (held fixed)
    'epochs': [20]          # Single epoch count (held fixed)
}

# Run the grid search; df_test['label'] is passed as the true test labels (y_test)
best_params, all_results = grid_search_doc2vec_parameters(tagged_data_train, df_train, df_test, df_test['label'], param_grid)
```

### Execute the whole notebook to see the results. 

# Setup

In [1]:
# Load pandas and numpy
import pandas as pd
import numpy as np

# For text preprocessing
import spacy
nlp = spacy.load('en_core_web_sm')

# For text vectorization we will use Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# For the classifier we will use Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# For evaluation we will use accuracy, f1-score, precision, recall and confusion matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix




In [2]:
# Load the datasets for training and testing
# NOTE(review): both paths are absolute paths on the original author's machine;
# point them at your own copy of the data before running.
filepath_train = '/Users/thebekhruz/Desktop/nlu/EvidenceExplorer/data/train/train.csv'
# The "test" set used throughout this notebook is the validation file dev.csv
filepath_test = '/Users/thebekhruz/Desktop/nlu/EvidenceExplorer/data/validate/dev.csv'

df_train = pd.read_csv(filepath_train)
df_test = pd.read_csv(filepath_test)

# Check the first 5 rows of the training dataset
df_train.head()


Unnamed: 0,Claim,Evidence,label
0,We should legalize the growing of coca leaf,"Robert W. Sweet, a federal judge, strongly agr...",1
1,We should ban trans fats usage in food,The net increase in LDL/HDL ratio with trans f...,1
2,We should legalize prostitution,"Pertaining to health, safety and services, the...",0
3,We should subsidize investigative journalism,"Date granted: 10 June 2002 Citation: ""For serv...",0
4,We should abolish homework,The Yarrabah community has a public library wh...,0


# Preprocessing

In [8]:
# Function to create tagged documents
# This is required for Doc2Vec to train the model

# We will use spaCy to tokenize and preprocess the text and then create tagged documents
# Each document is tagged with the index of the row in the dataframe


def create_tagged_document(df):
    """Convert each row of ``df`` into a gensim ``TaggedDocument``.

    The 'Claim' and 'Evidence' columns are joined with a single space, run
    through the module-level spaCy pipeline ``nlp`` and reduced to
    lower-cased lemmas with stop words and punctuation removed.

    Args:
        df: DataFrame with 'Claim' and 'Evidence' columns. Missing values
            are treated as empty text.

    Returns:
        A list with one ``TaggedDocument`` per row, in row order. Each
        document is tagged with its position in the frame (as a string),
        not with the DataFrame index label.
    """
    # A missing Claim or Evidence would turn the concatenation into NaN
    # (a float), which spaCy cannot process; treat missing values as ''.
    claims = df['Claim'].fillna('').astype(str)
    evidence = df['Evidence'].fillna('').astype(str)
    texts = claims + ' ' + evidence

    tagged_data = []
    # nlp.pipe processes the texts in batches, which spaCy documents as more
    # efficient than calling nlp(text) once per row.
    for i, doc in enumerate(nlp.pipe(texts)):
        # Tokenize and lemmatize the text, removing stopwords and punctuation
        tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
        # Create a TaggedDocument for each row in the dataframe
        tagged_data.append(TaggedDocument(words=tokens, tags=[str(i)]))  # Tags are typically strings
    return tagged_data


# Build the tagged corpora once up front; each call runs every row through spaCy
tagged_data_train = create_tagged_document(df_train)
tagged_data_test = create_tagged_document(df_test)


# Save the tagged documents to disk (relative paths, so the files land in the
# current working directory) -- presumably so the slow preprocessing step
# does not have to be repeated
import pickle

with open('tagged_data_train.pkl', 'wb') as f:
    pickle.dump(tagged_data_train, f)

with open('tagged_data_test.pkl', 'wb') as f:
    pickle.dump(tagged_data_test, f)



# Takes around 6 min


# Training Doc2Vec Model

### Finding the optimal parameters.

In [4]:
# Build and train a Doc2Vec model on the tagged documents
# By default we use a vector size of 768, a window size of 2 and a min_count of 1
# We will train the model for 20 epochs

def train_doc2vec_model(tagged_data, vector_size=768, window=2, min_count=1, epochs=20, workers=4):
    """Create a Doc2Vec model, build its vocabulary and train it.

    Args:
        tagged_data: Tagged documents. The same object is handed to both
            ``build_vocab`` and ``train``.
        vector_size: Forwarded to ``Doc2Vec`` as ``vector_size``.
        window: Forwarded to ``Doc2Vec`` as ``window``.
        min_count: Forwarded to ``Doc2Vec`` as ``min_count``.
        epochs: Forwarded to ``Doc2Vec`` and to ``train`` as ``epochs``.
        workers: Forwarded to ``Doc2Vec`` as ``workers``.

    Returns:
        The trained ``Doc2Vec`` instance.
    """
    hyperparameters = dict(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        epochs=epochs,
    )
    doc2vec = Doc2Vec(**hyperparameters)

    # The vocabulary has to be built before training can start
    doc2vec.build_vocab(tagged_data)

    example_count = doc2vec.corpus_count
    doc2vec.train(tagged_data, total_examples=example_count, epochs=epochs)
    return doc2vec

# Call the function to train the model
# model = train_doc2vec_model(tagged_data_train)




## Generate Embeddings

In [5]:
# Infer the vectors for the training data
def infer_vectors(model, tagged_documents):
    """Infer one vector per document and stack the results into an array.

    Args:
        model: Object exposing ``infer_vector(words)``.
        tagged_documents: Iterable of objects with a ``words`` attribute.

    Returns:
        A NumPy array built from the inferred vectors, in input order.
    """
    inferred = []
    for document in tagged_documents:
        inferred.append(model.infer_vector(document.words))
    return np.array(inferred)

# Infer the vectors for the training data
# vectors_train = infer_vectors(model, tagged_data_train)
# vectors_train.shape


## Train Logistic Regression Function

In [6]:
# # Extract features and labels
# X = vectors_train
# y = df_train['label']


# # Train Logistic Regression model
# clf = LogisticRegression(random_state=0, max_iter=1000)
# clf.fit(X, y)


# Evaluating the Model


In [7]:
from sklearn.metrics import accuracy_score
import itertools
import numpy as np
from sklearn.linear_model import LogisticRegression

def grid_search_doc2vec_parameters(tagged_data_train, tagged_data_test, y_train, y_test, param_grid):
    """Grid-search Doc2Vec hyper-parameters, scoring each by classifier accuracy.

    For every combination in ``param_grid`` a Doc2Vec model is trained on
    ``tagged_data_train``. Vectors for the training and the test documents
    are then inferred from that same trained model, a LogisticRegression is
    fitted on the training vectors and its accuracy on the test vectors is
    recorded.

    Args:
        tagged_data_train: Tagged training documents (objects with a
            ``words`` attribute).
        tagged_data_test: Tagged test documents (objects with a ``words``
            attribute).
        y_train: Labels for the training documents.
        y_test: True labels for the test documents.
        param_grid: Mapping whose values are iterables of candidate values.
            The values are unpacked positionally, so the entries must be
            ordered vector_size, window, min_count, epochs.

    Returns:
        A ``(best_params, all_results)`` tuple. ``best_params`` is a dict of
        the combination with the highest accuracy (empty if no combination
        scored above 0). ``all_results`` is a list of
        ``(accuracy, vector_size, window, min_count, epochs)`` tuples in
        evaluation order.
    """
    best_accuracy = 0
    best_params = {}
    all_results = []

    for vector_size, window, min_count, epochs in itertools.product(*param_grid.values()):
        print(f"Training Doc2Vec with vector_size={vector_size}, window={window}, min_count={min_count}, epochs={epochs}...")

        # Train Doc2Vec model
        model = train_doc2vec_model(tagged_data_train, vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, workers=4)

        # Infer vectors for both datasets from the model that was just trained.
        # The test vectors cannot be precomputed: they must have the same
        # dimensionality and come from the same embedding space as the
        # training vectors the classifier is fitted on.
        vectors_train = np.array([model.infer_vector(doc.words) for doc in tagged_data_train])
        vectors_test = np.array([model.infer_vector(doc.words) for doc in tagged_data_test])

        # Train Logistic Regression model
        clf = LogisticRegression(random_state=0, max_iter=1000)
        clf.fit(vectors_train, y_train)

        # Predict and evaluate
        y_pred = clf.predict(vectors_test)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"Accuracy: {accuracy}")

        # Keep track of the best parameters
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {'vector_size': vector_size, 'window': window, 'min_count': min_count, 'epochs': epochs}

        # Store all results
        all_results.append((accuracy, vector_size, window, min_count, epochs))

    print(f"Best Accuracy: {best_accuracy}")
    print(f"Best Parameters: {best_params}")
    return best_params, all_results

# Assuming tagged_data_train and tagged_data_test are already created
# (the NameError shown below this cell is what you get when they are not)
# Labels are read from the 'label' column of each dataframe
y_train = df_train['label']
y_test = df_test['label']

# Define your parameter grid
# NOTE: grid_search_doc2vec_parameters unpacks param_grid.values() positionally,
# so the keys must stay in the order vector_size, window, min_count, epochs.
param_grid = {
    'vector_size': [512],  # Single vector size (held fixed)
    'window': [2,4,6],         # Window sizes to compare
    'min_count': [2,4,6],      # min_count values to compare
    'epochs': [20]         # Single epoch count (held fixed)
}

# Run the grid search (3 window sizes x 3 min_count values = 9 combinations)
best_params, all_results = grid_search_doc2vec_parameters(tagged_data_train, tagged_data_test, y_train, y_test, param_grid)


NameError: name 'tagged_data_test' is not defined