In [None]:
import json
import csv
import requests
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load the pretrained sentence-embedding model once at module level.
# is_valid_entry() reads this global to embed coin names and answer texts.
model = SentenceTransformer('all-MiniLM-L6-v2')                  # loading sentence transformer model

# NOTE: the triple-quoted string below is a disabled draft of an API fetcher.
# It is an inert string expression (it defines no function at runtime);
# main() currently loads its input from a local JSON file via load_data().
'''
def fetch_api(url):                                              # function to fetch json data from API endpoint
    response = requests.get(url)                                 # sending GET request to API endpoint
    if response.status_code == 200:                              # checking if request was successful
        return response.json()                                   # parse and return json data
    else:
        response.raise_for_status()                              # raise exception if request failed
'''

def load_data(file_path):                                        # temporary local loader, slated for removal later
    """Read the UTF-8 encoded JSON file at ``file_path`` and return the parsed object."""
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

def is_valid_entry(entry, threshold=0.5):
    """Decide whether a logged entry's answers are relevant to its coin.

    Args:
        entry: dict with optional keys ``'coin'`` (str), ``'query'`` (str) and
            ``'answers'`` (list of dicts, each with an optional ``'text'``).
            Missing keys and ``None`` values are treated as empty.
        threshold: minimum cosine similarity between the coin name and any
            answer text for the semantic check to pass (default 0.5).

    Returns:
        bool: ``True`` when the coin is ``"general crypto"``, when the coin name
        appears (case-insensitively) in any answer text, or when any answer is
        semantically similar to the coin name; ``False`` otherwise, including
        when the coin, the query or the answers are missing.
    """
    coin = (entry.get('coin') or '').lower()
    query = (entry.get('query') or '').lower()
    answers = entry.get('answers') or []

    if coin == 'general crypto':                                 # always valid if coin is "general crypto"
        return True

    if not query or not answers:                                 # invalid if query or answers are missing
        return False

    if not coin:
        # '' is a substring of every string, so without this guard an entry
        # with no coin would pass the keyword check below for any answer.
        return False

    answer_texts = [(ans.get('text') or '') for ans in answers]

    # Cheap keyword check first: skip the embedding work when it already decides.
    if any(coin in text.lower() for text in answer_texts):
        return True

    # Semantic fallback: compare the coin name against every answer text using
    # the module-level sentence-transformer `model`.
    coin_embedding = model.encode(coin, convert_to_tensor=True)
    answer_embeddings = model.encode(answer_texts, convert_to_tensor=True)
    cosine_scores = util.cos_sim(coin_embedding, answer_embeddings)[0]

    return any(float(score) >= threshold for score in cosine_scores)

def save_results(results, csv_path):
    """Write validation results to ``csv_path`` as a UTF-8 CSV file.

    ``results`` is an iterable of ``(coin, query, status, answers)`` tuples,
    where ``answers`` is a list of dicts. A header row is written first. For
    each result only the ``'text'`` of the first answer is stored, prefixed
    with ``text: `` (the prefix alone when there are no answers).
    """
    header = ['coin', 'query','status', 'answers']
    with open(csv_path, 'w', newline='', encoding='utf-8') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(header)
        for coin, query, status, answers in results:
            # 'text' of every answer dict ('' when the key is absent)
            texts = [item.get('text', '') for item in answers]
            first_text = texts[0] if texts else ''
            # Single labelled string for the 'answers' column
            csv_out.writerow([coin, query, status, f"text: {first_text}"])

def main(input_path='/content/query_logs (2).json', output_path='validation_results.csv'):
    """Validate every logged entry and write the outcome to a CSV file.

    Args:
        input_path: JSON file holding a list of entry dicts (keys ``'coin'``,
            ``'query'``, ``'answers'``). Defaults to the Colab upload path the
            notebook was developed against.
        output_path: destination CSV; defaults to ``validation_results.csv``
            in the current working directory.
    """
    # TODO: replace the local file load with an API fetch once an endpoint URL
    # is available (a draft fetch_api is kept, disabled, near the top of the cell).
    data = load_data(input_path)                                  # this line is to be removed later

    results = []                                                  # (coin, query, status, answers) per entry
    for entry in data:
        status = 'valid' if is_valid_entry(entry) else 'invalid'
        # `or []` guards against a JSON null for 'answers', which save_results
        # would otherwise fail to iterate over.
        answers = entry.get('answers') or []
        results.append((entry.get('coin', ''), entry.get('query', ''), status, answers))

    save_results(results, output_path)                            # save results to a CSV file

# Run the validation pipeline; main() writes validation_results.csv.
if __name__ == '__main__':
    main()

# Read the freshly written CSV back and preview its first rows as the cell output.
df = pd.read_csv('validation_results.csv')
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unnamed: 0,coin,query,status,answers
0,solana,can i trade in this coin?,invalid,text:
1,dogecoin,trends,invalid,text:
2,solana,can i trade in this coin?,invalid,"text: - Bitcoin trades at $80,378, down 2.51%,..."
3,ethereum,trends,invalid,text: This further acts as a catalyst for bear...
4,dogecoin,trends,invalid,text: This further acts as a catalyst for bear...


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# --- Load the validation results written by the validation cell ---
# Raise instead of calling exit(): inside a notebook exit() shuts the kernel
# down and throws away all state instead of showing a useful error.
try:
    df = pd.read_csv("validation_results.csv")
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: validation_results.csv not found - run the validation cell first"
    ) from err

# --- Build one text field per row ---
# Empty CSV fields come back as NaN; blank them first so astype(str) does not
# inject a literal 'nan' token into the TF-IDF vocabulary.
df['coin'] = df['coin'].fillna('').astype(str)
df['query'] = df['query'].fillna('').astype(str)
# Strip only the leading 'text: ' label added when the CSV was written, not
# occurrences that happen to appear inside the answer itself.
df['answers'] = df['answers'].fillna('').astype(str).str.removeprefix('text: ')

df['text'] = df['coin'] + ' ' + df['query'] + ' ' + df['answers']

# --- Encode the target ---
# LabelEncoder assigns integer codes in sorted label order.
label_encoder = LabelEncoder()
df['status_encoded'] = label_encoder.fit_transform(df['status'])

print("Encoded labels:", df['status_encoded'].unique())

# --- TF-IDF features ---
# NOTE(review): the vectorizer is fitted on all rows, before the train/test
# split performed in the training cell, so test-set vocabulary and IDF weights
# leak into the features - consider fitting inside the model pipeline instead.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

print("Shape of TF-IDF matrix:", tfidf_matrix.shape)

dense_array = tfidf_matrix.toarray()  # shape (n_samples, n_features)

feature_names = tfidf_vectorizer.get_feature_names_out()
vectorised_data = pd.DataFrame(dense_array, columns=feature_names)

# Add the encoded status column to the vectorised data (last column)
vectorised_data['status_encoded'] = df['status_encoded']

# Persist the feature matrix. It is deliberately NOT also written to
# /content/validation_results.csv: that file is this cell's own input, and
# overwriting it with the feature matrix makes a re-run of this cell fail.
vectorised_data.to_csv("vectorised.csv", index=False)

df = pd.read_csv('vectorised.csv')
df.head()

Encoded labels: [0 1]
Shape of TF-IDF matrix: (63, 129)


Unnamed: 0,000,100,110,120,130,1726,2000,2009,2025,2026,...,two,unveil,value,what,why,with,world,would,year,status_encoded
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.353197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.173674,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
# importing libraries

import pandas as pd                                                                                     # to read csv, create and manipulate dataframe
from sklearn.model_selection import GridSearchCV, train_test_split                                      # GridSearchCV - to get best suited model among the given classification model,
                                                                                                        # train_test_split - to split the data into training and test set
from sklearn.preprocessing import StandardScaler                                                        # StandardScaler -  to center the data around mean zero and standard deviation 1
from sklearn.pipeline import Pipeline                                                                   # pipeline applies all the steps like encoding, imputation automatically
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error, root_mean_squared_error             # to get the classification report, MSE, MAE, RMSE - different metrices
from sklearn.ensemble import RandomForestClassifier
import numpy as np                                                                                      # to Calculate RMSE, MSE will be converted to numpy datatype
# Load the dataset
df = pd.read_csv("/content/vectorised.csv")

# Split the features and target variable.
# vectorised.csv is written with index=False, so it has no index column to
# skip: every column except the label is a TF-IDF feature. (Slicing from
# column 1 silently dropped the first real feature.)
X = df.drop(columns=['status_encoded'])
y = df['status_encoded']

# Splitting the data into train and test with the ratio 80:20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define classifiers and parameter grids.
# The forest is seeded so the grid search and the reported scores are reproducible.
models = {
    'rf': (
        RandomForestClassifier(random_state=42),
        {
            'model__n_estimators': [25, 50, 100, 200],
            'model__max_depth': [None, 5, 10, 15],
        },
    ),
}

# Run GridSearchCV for each model and find the best suited model.
# The loop variable is named `estimator` (not `model`) so it does not overwrite
# the notebook-global sentence-transformer `model` used by is_valid_entry().
best_models = {}
for name, (estimator, params) in models.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', estimator),
    ])
    grid = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='accuracy')
    grid.fit(X_train, y_train)
    best_models[name] = grid
    print(f"\nBest parameters for  {name}: {grid.best_params_}")
    print(f"Best cross-val accuracy: {grid.best_score_:.4f}")

# Pick the search with the highest cross-validated accuracy
best_model_name = max(best_models, key=lambda name: best_models[name].best_score_)
best_model = best_models[best_model_name]

# Validation over test data
y_pred = best_model.predict(X_test)

# Getting the best suited model name with classification report
print(f"\nEvaluating best model: {best_model_name.upper()}")
print(classification_report(y_test, y_pred))




Best parameters for  rf: {'model__max_depth': None, 'model__n_estimators': 25}
Best cross-val accuracy: 0.9400

Evaluating best model: RF
              precision    recall  f1-score   support

           0       0.88      1.00      0.93         7
           1       1.00      0.83      0.91         6

    accuracy                           0.92        13
   macro avg       0.94      0.92      0.92        13
weighted avg       0.93      0.92      0.92        13



In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Compute confusion matrix.
# labels=[0, 1] pins a 2x2 result even when the test split or the predictions
# contain a single class; without it ravel() may not yield four values and the
# unpacking below raises.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Create a well-labeled confusion matrix DataFrame
# (rows = actual class, columns = predicted class; class 0 is "Negative").
cm_df = pd.DataFrame(
    cm,
    index=['Actual Negative', 'Actual Positive'],
    columns=['Predicted Negative', 'Predicted Positive'],
)

# Display the confusion matrix
print("Confusion Matrix:\n")
print(cm_df)

Confusion Matrix:

                 Predicted Negative  Predicted Positive
Actual Negative                   7                   0
Actual Positive                   1                   5
