In [None]:
import pandas as pd
import json

# Load the workbook that holds the survey export.
# NOTE(review): the placeholder path ends in ".csv" but is opened with pd.ExcelFile,
# which expects an Excel workbook — confirm the real input path.
file_path = r'path.csv'
excel_file = pd.ExcelFile(file_path)

# Destination of the combined sheet; the success message below reports this same path.
# NOTE(review): other cells in this notebook use '/content/...' — confirm '/contents/' is intended.
output_path = '/contents/Response-time_format.xlsx'


def _to_json_safe(value):
    """Fallback for json.dumps: turn NumPy scalars into built-in numbers, anything else into text."""
    return value.item() if hasattr(value, "item") else str(value)


# Check if 'Responses' and 'Time Taken' sheets exist
if 'Responses' in excel_file.sheet_names and 'Time Taken' in excel_file.sheet_names:
    # Load the sheets
    df_responses = pd.read_excel(excel_file, sheet_name='Responses', header=0)
    df_timings = pd.read_excel(excel_file, sheet_name='Time Taken', header=0)

    # The n-th response column is paired with the timing column "Q<n> (seconds)".
    # Computed once here instead of once per row and per column.
    timing_cols = {
        col: f"Q{position + 1} (seconds)"
        for position, col in enumerate(df_responses.columns)
    }

    # Combine the data into JSON format
    combined_data = []
    for i, row in df_responses.iterrows():
        combined_row = {}
        for col, timing_col in timing_cols.items():
            # A timing is only available when both the column and this row exist in
            # the 'Time Taken' sheet; otherwise None is stored.
            has_timing = timing_col in df_timings.columns and i in df_timings.index
            combined_row[col] = json.dumps({
                "responses": row[col],
                "time taken": df_timings.at[i, timing_col] if has_timing else None
            }, default=_to_json_safe)  # NumPy integers are not JSON serializable by default
        combined_data.append(combined_row)

    # Convert to DataFrame
    combined_df = pd.DataFrame(combined_data)

    # Save the combined DataFrame to a new Excel file
    combined_df.to_excel(output_path, index=False)
    print(f"Combined data has been saved to '{output_path}'")
else:
    print("Worksheet 'Responses' or 'Time Taken' not found in the file.")

**Preprocessing the data**

In [None]:
import pandas as pd
import json
from collections import Counter

# One sample respondent: each question column holds a JSON string with the answer
# (or answer length for subjective questions) and the recorded "time taken".
_row_values = {
    "Category": "Rush",
    "SUB(Q1)": '{"response_length": 39, "time taken": 6.47}',
    "MCQ(Q2)": '{"responses": "b", "time taken": 19.23}',
    "MMCQ(Q1)": '{"responses": "b, c, a, d, e", "time taken": 24.93}',
    "SUB(Q2)": '{"response_length": 87, "time taken": 33.51}',
    "MCQ(Q3)": '{"responses": "c", "time taken": 75.48}',
    "MMCQ(Q2)": '{"responses": "a", "time taken": 76.6}',
    "SUB(Q3)": '{"response_length": 39, "time taken": 84.6}',
    "MCQ(Q4)": '{"responses": "c", "time taken": 110.52}',
    "SUB(Q4)": '{"response_length": 33, "time taken": 114.99}',
    "MCQ(Q5)": '{"responses": "b", "time taken": 138.23}',
    "MMCQ(Q3)": '{"responses": "c", "time taken": 144.2}',
    "MCQ(Q6)": '{"responses": "c", "time taken": 146.53}',
    "SUB(Q5)": '{"response_length": 47, "time taken": 149.92}',
    "MMCQ(Q4)": '{"responses": "b, a, c, e, d", "time taken": 162.32}',
    "MCQ(Q7)": '{"responses": "a", "time taken": 168.13}',
    "SUB(Q6)": '{"response_length": 2, "time taken": 172.03}',
    "MMCQ(Q5)": '{"responses": "b", "time taken": 176.82}',
    "MCQ(Q8)": '{"responses": "c", "time taken": 178.1}',
    "SUB(Q7)": '{"response_length": 88, "time taken": 184.83}',
    "Non-attempt": 0,
}

# Wrap every value in a one-element list so the mapping describes a single-row table.
data = {column: [value] for column, value in _row_values.items()}

df = pd.DataFrame(data)

# Function to calculate selection pattern, unique options, most common option, and diversity score
def calc_mcq_features(columns, frame=None):
    """Derive per-row option-selection features from JSON-encoded MCQ/MMCQ cells.

    Args:
        columns: Names of the columns to read. Each cell is expected to hold a JSON
            object string whose "responses" value is a ", "-separated option string.
        frame: DataFrame to process. Defaults to the module-level ``df`` so existing
            single-argument calls keep working.

    Returns:
        A tuple of four lists with one entry per row of ``frame``:
        selection patterns (options joined by " -> "), number of distinct options,
        most frequent option (None when the row has no usable answers), and
        diversity score (distinct options / total options, 0 when there are none).
    """
    if frame is None:
        frame = df

    selection_patterns = []
    unique_options_list = []
    most_common_options = []
    diversity_scores = []

    for _, row in frame.iterrows():
        all_responses = []
        for col in columns:
            try:
                response_data = json.loads(row[col])
                responses = response_data["responses"].split(", ")
            except (KeyError, TypeError, AttributeError, json.JSONDecodeError):
                # Skip cells that are blank/NaN (TypeError), not valid JSON, lack a
                # "responses" key, or hold a non-string answer (AttributeError).
                continue
            all_responses.extend(responses)

        # Calculate features
        selection_patterns.append(" -> ".join(all_responses))
        unique_options = set(all_responses)
        unique_options_list.append(len(unique_options))
        most_common_option = Counter(all_responses).most_common(1)
        most_common_options.append(most_common_option[0][0] if most_common_option else None)
        diversity_scores.append(len(unique_options) / len(all_responses) if all_responses else 0)

    return selection_patterns, unique_options_list, most_common_options, diversity_scores

# Columns holding single-choice (MCQ) or multi-choice (MMCQ) answers
mcq_cols = [name for name in df.columns if name.startswith(("MCQ", "MMCQ"))]

# Attach the four derived option-selection features to the frame
(
    df["Selection_Pattern"],
    df["Unique_Options"],
    df["Most_Common_Option"],
    df["Diversity_Score"],
) = calc_mcq_features(mcq_cols)

# Function to calculate total time for MCQ/MMCQ questions
def calc_total_time(columns, frame=None):
    """Sum the "time taken" values stored in the JSON cells of the given columns.

    Args:
        columns: Names of the columns to read. Each cell is expected to hold a JSON
            object string with a numeric "time taken" value.
        frame: DataFrame to process. Defaults to the module-level ``df`` so existing
            single-argument calls keep working.

    Returns:
        A list with one total per row of ``frame``; cells that cannot be used
        contribute nothing to the total.
    """
    if frame is None:
        frame = df

    total_times = []
    for _, row in frame.iterrows():
        total_time = 0
        for col in columns:
            try:
                response_data = json.loads(row[col])
                total_time += response_data["time taken"]
            except (KeyError, TypeError, json.JSONDecodeError):
                # Skip cells that are blank/NaN or hold a non-numeric time (TypeError),
                # are not valid JSON, or lack a "time taken" key.
                continue
        total_times.append(total_time)
    return total_times

# Add Total_Time_Per_Response column: the summed "time taken" over the MCQ/MMCQ
# columns only (mcq_cols); subjective (SUB) questions are not included here.
df["Total_Time_Per_Response"] = calc_total_time(mcq_cols)

# Function to calculate subjective metrics (length and time)
def calc_subjective_metrics(columns, frame=None):
    """Sum answer lengths and answer times stored in the JSON cells of the given columns.

    Args:
        columns: Names of the columns to read. Each cell is expected to hold a JSON
            object string with "response_length" and "time taken" values; a missing
            key counts as 0.
        frame: DataFrame to process. Defaults to the module-level ``df`` so existing
            single-argument calls keep working.

    Returns:
        A tuple ``(total_lengths, total_times)`` of lists with one entry per row.
    """
    if frame is None:
        frame = df

    total_lengths = []
    total_times = []
    for _, row in frame.iterrows():
        total_length = 0
        total_time = 0
        for col in columns:
            try:
                response_data = json.loads(row[col])
                # Compute both new totals before storing either, so a bad value in
                # one field cannot leave the pair half-updated.
                new_length = total_length + response_data.get("response_length", 0)
                new_time = total_time + response_data.get("time taken", 0)
            except (KeyError, TypeError, AttributeError, json.JSONDecodeError):
                # Skip cells that are blank/NaN or hold non-numeric values (TypeError),
                # are not valid JSON, or decode to something other than an object.
                continue
            total_length, total_time = new_length, new_time
        total_lengths.append(total_length)
        total_times.append(total_time)
    return total_lengths, total_times

# Subjective question columns
sub_cols = [col for col in df.columns if col.startswith("SUB")]

# Add Total_Sub_Length and Total_Sub_Time columns
df['Total_Sub_Length'], df['Total_Sub_Time'] = calc_subjective_metrics(sub_cols)

# Calculate Avg_Typing_Speed (length per second).
# A total time of 0 (no usable subjective answers) is replaced by NaN before dividing,
# so the speed becomes NaN (missing) rather than an infinite value.
df['Avg_Typing_Speed'] = df['Total_Sub_Length'] / df['Total_Sub_Time'].replace(0, float("nan"))

# Columns kept in the processed output, in presentation order
final_columns = [
    'Category',
    'Selection_Pattern',
    'Unique_Options',
    'Most_Common_Option',
    'Diversity_Score',
    'Total_Time_Per_Response',
    'Total_Sub_Length',
    'Total_Sub_Time',
    'Avg_Typing_Speed',
    'Non-attempt',
]
final_df = df.loc[:, final_columns]

# Show the processed frame
print(final_df)

# Write the processed frame to an Excel workbook (without the index column)
output_file = "processed_data2.xlsx"
final_df.to_excel(output_file, index=False)

print(f"Data has been successfully saved to {output_file}")


  Category                                  Selection_Pattern  Unique_Options  \
0  Genuine  b -> b -> c -> a -> d -> e -> c -> a -> c -> b...               5   

  Most_Common_Option  Diversity_Score  Total_Time_Per_Response  \
0                  c             0.25                  1421.09   

   Total_Sub_Length  Total_Sub_Time  Avg_Typing_Speed  Non-attempt  
0               335          746.35          0.448851            0  
Data has been successfully saved to processed_data2.xlsx


**Loading the data**

In [None]:
import pandas as pd

# Location of the prepared training dataset
file_path = "/content/adjusted_dataset.csv"

# Read the CSV into a DataFrame
data = pd.read_csv(file_path)

# Print the first five rows as a quick sanity check
print(data.head(n=5))

  Category                                  Selection_Pattern  Unique_Options  \
0  Genuine  b -> b -> c -> a -> d -> e -> c -> a -> c -> b...               5   
1     Rush  a -> a -> c -> d -> a -> e -> d -> c -> a -> b...               5   
2  Genuine  a -> d -> c -> a -> a -> b -> e -> d -> c -> b...               5   
3  Genuine  a -> d -> c -> a -> a -> b -> e -> d -> c -> b...               5   
4  Genuine  c -> a -> d -> e -> a -> d -> d -> e -> b -> d...               5   

  Most_Common_Option  Diversity_Score  Total_Time_Per_Response  \
0                  c           0.2500                     1421   
1                  a           0.3125                     1035   
2                  a           0.2500                     2045   
3                  a           0.2500                     2045   
4                  d           0.3125                     1113   

   Total_Sub_Length  Total_Sub_Time  Avg_Typing_Speed  Non-attempt  
0            746.35        0.448851          0.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Define features and target
X = data.drop('Category', axis=1)  # Drop the target column
y = data['Category']  # Target column

# Encode the target variable (LabelEncoder assigns integer codes in sorted class order)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Convert non-numeric columns to numeric using one-hot encoding
X_encoded = pd.get_dummies(X, drop_first=True)

# Keep all data for training.
# The previous train_test_split(..., test_size=0.001) always held out at least one
# row, contradicting the stated intent, so the encoded data is used directly.
X_train, y_train = X_encoded, y_encoded

# Now X_train and y_train contain all the data for training


**Training and testing the BGT model**

In [None]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib

# Load your training data
data = pd.read_csv('/content/adjusted_dataset.csv')  # Replace with the path to your training data

# Define features and target
X = data.drop('Category', axis=1)  # Drop the target column
y = data['Category']  # Target column

# Encode the target variable. LabelEncoder assigns codes in sorted class order, so
# the printed label below is recovered with inverse_transform instead of assuming
# which integer means which category.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Convert non-numeric columns to numeric using one-hot encoding
X_encoded = pd.get_dummies(X, drop_first=True)

# Keep all data for training.
# The previous train_test_split(..., test_size=0.01) held rows out of training,
# contradicting the stated intent, so the encoded data is used directly.
X_train, y_train = X_encoded, y_encoded

# Initialize the GBT model.
# `use_label_encoder` was dropped: the installed XGBoost reports it as unused.
gbt_model = XGBClassifier(random_state=42, eval_metric="mlogloss")

# Train the model using preprocessed training data
gbt_model.fit(X_train, y_train)

# Save the trained BGT model to a file
joblib.dump(gbt_model, 'bgt_model.pkl')  # Saves the model as 'bgt_model.pkl'
print("BGT model has been saved as 'bgt_model.pkl'.")

# Ask for the location URL of the Excel file containing the data to be tested
excel_url = input("Enter the location URL of the Excel file containing the data to be tested: ")

# Load the test data from the Excel file
test_data = pd.read_excel(excel_url, sheet_name='Sheet1')

# Prepare the test data for prediction
test_features = test_data[['Selection_Pattern', 'Unique_Options', 'Most_Common_Option', 'Diversity_Score', 'Total_Time_Per_Response', 'Total_Sub_Length', 'Total_Sub_Time', 'Avg_Typing_Speed', 'Non-attempt']]

# One-hot encode WITHOUT drop_first: with few test rows (e.g. a single respondent)
# drop_first would discard the only category present and the row's categorical
# features would all end up as 0. Aligning to the training columns afterwards keeps
# exactly the indicator columns the model was trained on, adds the missing ones as 0
# and discards categories unseen during training.
test_features_encoded = pd.get_dummies(test_features).reindex(columns=X_train.columns, fill_value=0)

# Predict using the trained model
y_pred_test = gbt_model.predict(test_features_encoded)
y_pred_proba = gbt_model.predict_proba(test_features_encoded)

# Translate the integer predictions back to the original category names
predicted_labels = label_encoder.inverse_transform(y_pred_test)

# Print the predictions with the probability of the predicted class
print("Predictions for the test data:")
for i, (class_index, label) in enumerate(zip(y_pred_test, predicted_labels)):
    print(f"Row {i+1}: The survey was a {str(label).lower()} attempt with probability {y_pred_proba[i][class_index] * 100:.2f}%")


Parameters: { "use_label_encoder" } are not used.



BGT model has been saved as 'bgt_model.pkl'.
Enter the location URL of the Excel file containing the data to be tested: /content/processed_data2.xlsx
Predictions for the test data:
Row 1: The survey was a genuine attempt with probability 94.92%


**LOGICAL ANALYSIS**

In [None]:
import json
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model; check_logical_connection calls model.encode on
# question and answer text and compares the resulting embeddings by cosine similarity.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def check_logical_connection(question1, question2, answer1, answer2, question_threshold=0.75, answer_threshold=0.5):
    """
    Determines logical connection between questions and their answers based on similarity thresholds.

    Args:
        question1, question2: Question texts to compare.
        answer1, answer2: The answers given to those questions.
        question_threshold: Minimum cosine similarity for the questions to count as related.
        answer_threshold: Minimum cosine similarity for the answers to count as consistent.

    Returns:
        A tuple ``(is_connected, question_similarity, answer_similarity)``.
        ``answer_similarity`` is None whenever ``is_connected`` is False.
    """
    # Encode the questions and calculate their cosine similarity
    question1_embedding = model.encode(question1)
    question2_embedding = model.encode(question2)
    question_similarity = util.cos_sim(question1_embedding, question2_embedding).item()

    # Unrelated questions: stop here so the answers are never encoded needlessly
    if question_similarity < question_threshold:
        return False, question_similarity, None

    # Questions are similar, so compare the answers. Answers read from a spreadsheet
    # may be numbers rather than text; str() is a no-op for strings.
    answer1_embedding = model.encode(str(answer1))
    answer2_embedding = model.encode(str(answer2))
    answer_similarity = util.cos_sim(answer1_embedding, answer2_embedding).item()

    if answer_similarity >= answer_threshold:
        return True, question_similarity, answer_similarity  # Both questions and answers are logically connected

    return False, question_similarity, None

def main():
    """
    Main function to process the survey script, identify related questions,
    and analyze logical connections in the responses.

    Reads the survey definition (JSON) and the path of the responses workbook from
    interactive prompts, prints every ordered pair of related questions, then prints
    each row/pair whose answers are also similar.
    """
    # Single source for the question-similarity cut-off used in both stages below
    question_threshold = 0.75  # Adjust threshold as needed

    # Prompt user to enter survey script in JSON format
    survey_script = input("Enter the survey script in JSON format: ")
    survey_data = json.loads(survey_script)

    # Extract questions
    questions = {q['id']: q['questionText'] for q in survey_data['questions']}

    # Print related questions based on question similarity and remember them, so the
    # per-row pass below does not re-evaluate unrelated pairs for every respondent.
    print("Related Questions with Similarity Scores:")
    related_pairs = []
    for q1_id, q1_text in questions.items():
        for q2_id, q2_text in questions.items():
            if q1_id != q2_id:
                _, question_similarity, _ = check_logical_connection(
                    q1_text, q2_text, "", "", question_threshold=question_threshold
                )
                if question_similarity >= question_threshold:
                    print(f"Questions '{q1_text}' and '{q2_text}' have a similarity score of {question_similarity:.2f}")
                    related_pairs.append((q1_text, q2_text))

    # Load survey responses from an Excel file
    excel_url = input("Enter the location URL of the Excel file containing the responses: ")
    df = pd.read_excel(excel_url, sheet_name='Responses')

    # Compare logical connections for each row in the survey data
    for i, row in df.iterrows():
        for q1_text, q2_text in related_pairs:
            answer1 = row[q1_text]
            answer2 = row[q2_text]
            # A missing answer carries no information. Previously both blanks became ""
            # and two identical empty strings were reported as "logically connected".
            if pd.isna(answer1) or pd.isna(answer2):
                continue
            is_connected, question_similarity, answer_similarity = check_logical_connection(
                q1_text, q2_text, answer1, answer2, question_threshold=question_threshold
            )
            if is_connected:
                print(f"Row {i+1}: Questions '{q1_text}' and '{q2_text}' are logically connected with question similarity {question_similarity:.2f} and answer similarity {answer_similarity:.2f}.")

# Run the interactive analysis only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Enter the survey script in JSON format: {   "title": "Survey - 2 (AI)",   "questions": [     {       "id": "question-1",       "type": "mcq",       "mandatory": true,       "questionText": "Is this a genuine attempt or a rush attempt?",       "options": [         "Genuine Attempt",         "Rush Attempt"       ]     },     {       "id": "question-2",       "type": "subjective",       "mandatory": true,       "questionText": "What do you think is the most exciting application of AI in today's world?",       "options": []     },     {       "id": "question-3",       "type": "mcq",       "mandatory": true,       "questionText": "Which area of AI do you think will grow the most in the next 5 years?",       "options": [         "Healthcare",         "Education",         "Finance",         "Entertainment",         "Transportation"       ]     },     {       "id": "question-4",       "type": "multi-select",       "mandatory": false,       "questionText": "Which challenges of AI development co

**BGT+LOGICAL FINAL MODEL**

In [None]:
import json
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import pickle
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

# Initialize the SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Load the BGT model from the saved file.
# The training cell writes this file with joblib.dump, so it is read back with
# joblib.load; plain pickle.load is not guaranteed to restore joblib's format.
# NOTE: joblib/pickle files can execute arbitrary code on load — only load model
# files you created or otherwise trust.
bgt_model = joblib.load('/content/bgt_model.pkl')

def check_logical_connection(question1, question2, answer1, answer2, question_threshold=0.75, answer_threshold=0.5):
    """
    Determines logical connection between questions and their answers based on similarity thresholds.

    Args:
        question1, question2: Question texts to compare.
        answer1, answer2: The answers given to those questions.
        question_threshold: Minimum cosine similarity for the questions to count as related.
        answer_threshold: Minimum cosine similarity for the answers to count as consistent.

    Returns:
        A tuple ``(is_connected, question_similarity, answer_similarity)``.
        ``answer_similarity`` is None whenever ``is_connected`` is False.
    """
    # Encode the questions and calculate their cosine similarity
    question1_embedding = model.encode(question1)
    question2_embedding = model.encode(question2)
    question_similarity = util.cos_sim(question1_embedding, question2_embedding).item()

    # Unrelated questions: stop here so the answers are never encoded needlessly
    if question_similarity < question_threshold:
        return False, question_similarity, None

    # Questions are similar, so compare the answers. Answers read from a spreadsheet
    # may be numbers rather than text; str() is a no-op for strings.
    answer1_embedding = model.encode(str(answer1))
    answer2_embedding = model.encode(str(answer2))
    answer_similarity = util.cos_sim(answer1_embedding, answer2_embedding).item()

    if answer_similarity >= answer_threshold:
        return True, question_similarity, answer_similarity  # Both questions and answers are logically connected

    return False, question_similarity, None

def calculate_logical_connection_score(questions, options, df):
    """
    Calculates the logical connection score as the fraction of answer comparisons
    that check_logical_connection reports as connected.

    Args:
        questions: Mapping of question id to question text; the text is also used
            as the column name in ``df``.
        options: Unused; kept so existing callers do not break.
        df: DataFrame of responses, one row per respondent.

    Returns:
        connected comparisons / total comparisons, in the range 0..1
        (0 when there is nothing to compare).
    """
    connected_count = 0
    total_comparisons = 0

    for q1_id, q1_text in questions.items():
        for q2_id, q2_text in questions.items():
            if q1_id == q2_id:
                continue
            for _, row in df.iterrows():
                # One comparison per (question pair, row). Counting only once per pair,
                # as before, let the score exceed 1 when df had more than one row.
                total_comparisons += 1
                answer1 = row[q1_text]
                answer2 = row[q2_text]
                # A missing answer cannot be connected to anything. Previously both
                # blanks became "" and identical empty strings counted as connected.
                if pd.isna(answer1) or pd.isna(answer2):
                    continue
                is_connected, _, _ = check_logical_connection(q1_text, q2_text, answer1, answer2)
                if is_connected:
                    connected_count += 1

    logical_connection_score = (connected_count / total_comparisons) if total_comparisons > 0 else 0
    return logical_connection_score

def preprocess_data(data):
    """
    Splits a labelled frame into one-hot encoded features and an integer-encoded target.

    The 'Category' column is the target; every other column is a feature.
    Non-numeric feature columns are expanded with pd.get_dummies (first level dropped).
    No missing-value handling is performed here.

    Returns:
        (X_encoded, y_encoded)
    """
    # Everything except the target column is a feature
    features = data.drop(columns='Category')
    target = data['Category']

    # Integer codes for the target labels
    y_encoded = LabelEncoder().fit_transform(target)

    # Indicator columns for the non-numeric features
    X_encoded = pd.get_dummies(features, drop_first=True)

    return X_encoded, y_encoded

def main():
    """
    Main function to process the survey script, identify related questions,
    and analyze logical connections in the responses.

    Reads the survey definition (JSON) and two workbook paths from interactive
    prompts, prints the related question pairs, the logical connection score, the
    BGT model prediction for each row, and a weighted combination of both scores.
    """
    # Tunable constants, kept in one place
    question_threshold = 0.75  # Adjust threshold as needed
    logical_weight = 0.3
    bgt_weight = 0.7

    # Prompt user to enter survey script in JSON format
    survey_script = input("Enter the survey script in JSON format: ")
    survey_data = json.loads(survey_script)

    # Extract questions and options
    questions = {q['id']: q['questionText'] for q in survey_data['questions']}
    options = {q['id']: q['options'] for q in survey_data['questions']}

    # Print related questions based on question similarity
    print("Related Questions with Similarity Scores:")
    for q1_id, q1_text in questions.items():
        for q2_id, q2_text in questions.items():
            if q1_id != q2_id:
                _, question_similarity, _ = check_logical_connection(
                    q1_text, q2_text, "", "", question_threshold=question_threshold
                )
                if question_similarity >= question_threshold:
                    print(f"Questions '{q1_text}' and '{q2_text}' have a similarity score of {question_similarity:.2f}")

    # Load survey responses from an Excel file (for logical connection analysis)
    logical_excel_url = input("Enter the location URL of the Excel file containing the responses (for logical connection analysis): ")
    logical_df = pd.read_excel(logical_excel_url, sheet_name='Responses')

    # Calculate logical connection score
    logical_connection_score = calculate_logical_connection_score(questions, options, logical_df)
    print(f"Logical Connection Score: {logical_connection_score:.2f}")

    # Load BGT Excel file
    bgt_excel_url = input("Enter the location URL of the Excel file for BGT model analysis: ")
    bgt_df = pd.read_excel(bgt_excel_url, sheet_name='Sheet1')

    # Preprocess the BGT data
    bgt_features = bgt_df[['Selection_Pattern', 'Unique_Options', 'Most_Common_Option', 'Diversity_Score', 'Total_Time_Per_Response', 'Total_Sub_Length', 'Total_Sub_Time', 'Avg_Typing_Speed', 'Non-attempt']]

    # Take the expected feature columns from the loaded model itself rather than from
    # an `X_train` variable left behind by another cell, so this cell also works on a
    # fresh kernel.
    training_columns = bgt_model.get_booster().feature_names
    if training_columns is None:
        raise ValueError("The loaded BGT model does not store feature names; cannot align the input columns.")

    # One-hot encode WITHOUT drop_first: with few rows (e.g. a single respondent)
    # drop_first would discard the only category present and the row's categorical
    # features would all end up as 0. Aligning to the training columns keeps exactly
    # the indicator columns the model was trained on and adds the missing ones as 0.
    bgt_features_encoded = pd.get_dummies(bgt_features).reindex(columns=training_columns, fill_value=0)

    # Predict using the trained BGT model
    bgt_prediction = bgt_model.predict(bgt_features_encoded)
    bgt_proba = bgt_model.predict_proba(bgt_features_encoded)

    # Print the predictions with probabilities.
    # Assumes class 0 is "Genuine" and class 1 is "Rush", i.e. the sorted order that
    # LabelEncoder assigns when those are the only two categories — TODO confirm.
    print("Predictions for the test data:")
    for i in range(len(bgt_prediction)):
        if bgt_prediction[i] == 0:
            print(f"Row {i+1}: The survey was a genuine attempt with probability {bgt_proba[i][0] * 100:.2f}%")
        else:
            print(f"Row {i+1}: The survey was a rush attempt with probability {bgt_proba[i][1] * 100:.2f}%")

    # Calculate BGT model score (mean of the binary predictions)
    # NOTE(review): with class 1 = rush this is the share of rows predicted as rush,
    # while a high logical connection score looks like a sign of a genuine attempt —
    # confirm that both scores point the same way before relying on the final score.
    bgt_score = np.mean(bgt_prediction)  # Assuming the prediction is a binary classification
    print(f"BGT Model Score: {bgt_score:.2f}")

    # Combine scores using weighted average (30% logical connection, 70% BGT model)
    final_score = (logical_weight * logical_connection_score) + (bgt_weight * bgt_score)
    print(f"Final Score: {final_score:.2f}")

# Run the interactive pipeline only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Enter the survey script in JSON format: {   "title": "Survey - 2 (AI)",   "questions": [     {       "id": "question-1",       "type": "mcq",       "mandatory": true,       "questionText": "Is this a genuine attempt or a rush attempt?",       "options": [         "Genuine Attempt",         "Rush Attempt"       ]     },     {       "id": "question-2",       "type": "subjective",       "mandatory": true,       "questionText": "What do you think is the most exciting application of AI in today's world?",       "options": []     },     {       "id": "question-3",       "type": "mcq",       "mandatory": true,       "questionText": "Which area of AI do you think will grow the most in the next 5 years?",       "options": [         "Healthcare",         "Education",         "Finance",         "Entertainment",         "Transportation"       ]     },     {       "id": "question-4",       "type": "multi-select",       "mandatory": false,       "questionText": "Which challenges of AI development co