In [44]:
import pandas as pd
import json

CSV_FILE = 'test_results_july3.csv'

# Define the field names
fieldnames = [
    'section', 'dataset', 'embedder', 'chunking_detail', 'timestamp', 'extra_info', 'device', 'evalmodel', 'Prompt', 'ModelName',
    'Temperature', 'TopK', 'SimilarityThresholdDocuments',
    'SimilarityThresholdQuestions', 'runId', 'category',
    'TFIDFScore', 'ResponseTime', 'answer_correctness',
    'faithfulness', 'answer_similarity', 'answer_relevancy', 'context_precision',
    'context_relevancy', 'context_recall', 'response_json'
]

# Load the CSV file
df = pd.read_csv(CSV_FILE)
# df= df[(df['chunking_detail'] == 'no_RAG')]
# Parse the 'response_json' to extract the 'answer' field
def extract_answer(json_str):
    try:
        response = json.loads(json_str)
        return response.get('answer', '')
    except json.JSONDecodeError:
        return ''

# Create a new column with the extracted answers
df['extracted_answer'] = df['response_json'].apply(extract_answer)

# Log how many answers contain "I don't know"
num_dont_know = df['extracted_answer'].str.contains("I don't know", case=False).sum()
print(f"Number of answers containing 'I don't know': {num_dont_know}")



Number of answers containing 'I don't know': 300


In [5]:
import pandas as pd
         
CSV_FILE = 'test_results.csv'

fieldnames = [
    'section', 'dataset', 'embedder', 'chunking_detail', 'timestamp', 'extra_info', 'device', 'evalmodel', 'Prompt', 'ModelName',
    'Temperature', 'TopK', 'SimilarityThresholdDocuments',
    'SimilarityThresholdQuestions', 'runId', 'category',
    'TFIDFScore', 'ResponseTime', 'answer_correctness',
    'faithfulness', 'answer_similarity', 'answer_relevancy', 'context_precision',
    'context_relevancy', 'context_recall', 'response_json'
]

# Read the CSV file into a DataFrame
df = pd.read_csv(CSV_FILE, usecols=fieldnames)

# Ensure numeric columns are indeed numeric
numeric_cols = [
    'TFIDFScore', 'ResponseTime', 'answer_similarity', 'answer_correctness'
]

# Convert the specified columns to numeric, coercing errors to NaN
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values in numeric columns to avoid issues during aggregation
df = df.dropna(subset=numeric_cols)

# df_filtered = df[(df['section'] == 'General') & (df['chunking_detail'] != 'no_RAG')]
df_filtered = df[(df['section'] == 'General')]

# Group by both 'ModelName' and 'chunking_detail' and calculate the mean
comparison_df = df_filtered.groupby(['ModelName', 'chunking_detail'])[numeric_cols].mean().reset_index()

print(comparison_df)


             ModelName chunking_detail  TFIDFScore  ResponseTime  \
0            gemma2:9b       1000_20_3    0.480896   2242.114147   
1            gemma2:9b          no_RAG    0.382859   1656.865953   
2   gpt-3.5-turbo-0125       1000_20_3    0.497467   3952.746252   
3   gpt-3.5-turbo-0125          no_RAG    0.465772   1246.909274   
4               gpt-4o       1000_20_3    0.524822   5148.209342   
5               gpt-4o    assistantAPI    0.512130   9099.639243   
6               gpt-4o          no_RAG    0.454131   2245.967796   
7           llama3:70b       1000_20_3    0.510852   7446.358219   
8           llama3:70b          no_RAG    0.446166   7594.727234   
9            llama3:8b       1000_20_3    0.474095   1999.263052   
10           llama3:8b          no_RAG    0.416392   1484.852913   
11            phi3:14b       1000_20_3    0.459565   2679.640484   
12            phi3:14b          no_RAG    0.395021   2987.052896   
13           phi3:3.8b       1000_20_3    0.4279

In [51]:
import pandas as pd
import numpy as np

CSV_FILE = 'test_results_july3.csv'

fieldnames = [
    'section', 'dataset', 'embedder', 'chunking_detail', 'timestamp', 'extra_info', 'device', 'evalmodel', 'Prompt', 'ModelName',
    'Temperature', 'TopK', 'SimilarityThresholdDocuments',
    'SimilarityThresholdQuestions', 'runId', 'category',
    'TFIDFScore', 'ResponseTime', 'answer_correctness',
    'faithfulness', 'answer_similarity', 'answer_relevancy', 'context_precision',
    'context_relevancy', 'context_recall', 'response_json'
]

# Read the CSV file into a DataFrame
df = pd.read_csv(CSV_FILE, usecols=fieldnames)

# Ensure numeric columns are indeed numeric
numeric_cols = [
    'TFIDFScore', 'ResponseTime', 'answer_similarity', 'answer_correctness'
]

# Convert the specified columns to numeric, coercing errors to NaN
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values in numeric columns to avoid issues during aggregation
df = df.dropna(subset=numeric_cols)

# Filter the DataFrame for 'General' section
df_filtered = df[df['section'] == 'General']

# Calculate mean, standard deviation, and RMSE for the specified metrics
df_filtered['TFIDFScore_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['TFIDFScore'].transform('std')
df_filtered['answer_similarity_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_similarity'].transform('std')
df_filtered['answer_correctness_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_correctness'].transform('std')

df_filtered['TFIDFScore_rmse'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['TFIDFScore'].transform(lambda x: np.sqrt(np.mean((x - x.mean())**2)))
df_filtered['answer_similarity_rmse'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_similarity'].transform(lambda x: np.sqrt(np.mean((x - x.mean())**2)))
df_filtered['answer_correctness_rmse'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_correctness'].transform(lambda x: np.sqrt(np.mean((x - x.mean())**2)))

comparison_df = df_filtered.groupby(['ModelName', 'chunking_detail'])[numeric_cols + [
    'TFIDFScore_std', 'answer_similarity_std', 'answer_correctness_std',
    'TFIDFScore_rmse', 'answer_similarity_rmse', 'answer_correctness_rmse'
]].mean().reset_index()

# Calculate weighted score and average distribution of the three metrics
comparison_df['WeightedScore'] = comparison_df.apply(
    lambda row: (row['TFIDFScore'] + row['answer_similarity'] + row['answer_correctness']) / 3, axis=1
)
comparison_df['AverageDistribution'] = comparison_df[['TFIDFScore', 'answer_similarity', 'answer_correctness']].mean(axis=1)

print(comparison_df)


             ModelName chunking_detail  TFIDFScore  ResponseTime  \
0            gemma2:9b       1000_20_3    0.484935   2192.673441   
1            gemma2:9b          no_RAG    0.399836    982.750980   
2   gpt-3.5-turbo-0125       1000_20_3    0.491482   1985.400037   
3   gpt-3.5-turbo-0125    assistantAPI    0.511297   4805.140309   
4               gpt-4o       1000_20_3    0.518271   3540.949103   
5           llama3:70b       1000_20_3    0.501360   7725.311983   
6           llama3:70b          no_RAG    0.434916   5722.910952   
7            llama3:8b       1000_20_3    0.474585   2035.047739   
8            llama3:8b          no_RAG    0.425080   1640.433754   
9             phi3:14b       1000_20_3    0.462092   3119.280640   
10           phi3:3.8b       1000_20_3    0.419408   2014.021480   
11           phi3:3.8b          no_RAG    0.397382   1356.387053   

    answer_similarity  answer_correctness  TFIDFScore_std  \
0            0.828361            0.591821        0.189

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['TFIDFScore_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['TFIDFScore'].transform('std')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['answer_similarity_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_similarity'].transform('std')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

In [19]:
import pandas as pd

CSV_FILE = 'test_results.csv'

fieldnames = [
    'section', 'dataset', 'embedder', 'chunking_detail', 'timestamp', 'extra_info', 'device', 'evalmodel', 'Prompt', 'ModelName',
    'Temperature', 'TopK', 'SimilarityThresholdDocuments',
    'SimilarityThresholdQuestions', 'runId', 'category',
    'TFIDFScore', 'ResponseTime', 'answer_correctness',
    'faithfulness', 'answer_similarity', 'answer_relevancy', 'context_precision',
    'context_relevancy', 'context_recall', 'response_json'
]

# Read the CSV file into a DataFrame
df = pd.read_csv(CSV_FILE, usecols=fieldnames)

# Update the chunking_detail for the specified runIds
df.loc[df['runId'] == 1721458822, 'chunking_detail'] = 'assistantAPI'
df.loc[df['runId'] == 1721494959, 'chunking_detail'] = 'no_RAG'

# Save the modified DataFrame back to the CSV file
df.to_csv(CSV_FILE, index=False)


In [48]:
import pandas as pd
import numpy as np

CSV_FILE = 'test_results.csv'

fieldnames = [
    'section', 'dataset', 'embedder', 'chunking_detail', 'timestamp', 'extra_info', 'device', 'evalmodel', 'Prompt', 'ModelName',
    'Temperature', 'TopK', 'SimilarityThresholdDocuments',
    'SimilarityThresholdQuestions', 'runId', 'category',
    'TFIDFScore', 'ResponseTime', 'answer_correctness',
    'faithfulness', 'answer_similarity', 'answer_relevancy', 'context_precision',
    'context_relevancy', 'context_recall', 'response_json'
]

# Read the CSV file into a DataFrame
df = pd.read_csv(CSV_FILE, usecols=fieldnames)

# Ensure numeric columns are indeed numeric
numeric_cols = [
    'TFIDFScore', 'ResponseTime', 'answer_similarity', 'answer_correctness'
]

# Convert the specified columns to numeric, coercing errors to NaN
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')


# Assuming df and numeric_cols are defined
df = df.dropna(subset=numeric_cols)

df_filtered = df[(df['section'] != 'General')]

# Calculate mean, standard deviation, and RMSE for the specified metrics
df_filtered['TFIDFScore_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['TFIDFScore'].transform('std')
df_filtered['answer_similarity_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_similarity'].transform('std')
df_filtered['answer_correctness_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_correctness'].transform('std')

df_filtered['TFIDFScore_rmse'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['TFIDFScore'].transform(lambda x: np.sqrt(np.mean((x - x.mean())**2)))
df_filtered['answer_similarity_rmse'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_similarity'].transform(lambda x: np.sqrt(np.mean((x - x.mean())**2)))
df_filtered['answer_correctness_rmse'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_correctness'].transform(lambda x: np.sqrt(np.mean((x - x.mean())**2)))

comparison_df = df_filtered.groupby(['ModelName', 'chunking_detail', 'runId'])[numeric_cols + [
    'TFIDFScore_std', 'answer_similarity_std', 'answer_correctness_std',
    'TFIDFScore_rmse', 'answer_similarity_rmse', 'answer_correctness_rmse'
]].mean().reset_index()

# Calculate weighted score and average distribution of the three metrics
comparison_df['AverageScore'] = comparison_df.apply(
    lambda row: (row['TFIDFScore'] + row['answer_similarity'] + row['answer_correctness']) / 3, axis=1
)
comparison_df['AverageStd'] = comparison_df[['TFIDFScore_std', 'answer_similarity_std', 'answer_correctness_std']].mean(axis=1)
comparison_df['AverageRMSE'] = comparison_df[['TFIDFScore_rmse', 'answer_similarity_rmse', 'answer_correctness_rmse']].mean(axis=1)

print(comparison_df)



             ModelName chunking_detail       runId  TFIDFScore  ResponseTime  \
0            gemma2:9b       1000_20_3  1720708303    0.425435   1759.735881   
1            gemma2:9b          no_RAG  1720649180    0.238822   1340.057940   
2   gpt-3.5-turbo-0125       1000_20_3  1720675512    0.435688   1806.864813   
3   gpt-3.5-turbo-0125       1000_20_3  1720774209    0.396763   2855.950499   
4   gpt-3.5-turbo-0125          no_RAG  1720802607    0.304090   1078.918457   
5               gpt-4o       1000_20_3  1720690030    0.448099   3730.162665   
6               gpt-4o    assistantAPI  1720826113    0.460103   7660.685264   
7               gpt-4o          no_RAG  1720817390    0.275471   1699.062331   
8          gpt-4o-mini       1000_20_3  1721448144    0.430777   2487.343407   
9          gpt-4o-mini    assistantAPI  1721458822    0.450224   9018.972574   
10         gpt-4o-mini    assistantAPI  1721605211    0.449528   7280.107872   
11         gpt-4o-mini          no_RAG  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['TFIDFScore_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['TFIDFScore'].transform('std')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['answer_similarity_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_similarity'].transform('std')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

In [36]:
import pandas as pd
import numpy as np

CSV_FILE = 'test_results.csv'

fieldnames = [
    'section', 'dataset', 'embedder', 'chunking_detail', 'timestamp', 'extra_info', 'device', 'evalmodel', 'Prompt', 'ModelName',
    'Temperature', 'TopK', 'SimilarityThresholdDocuments',
    'SimilarityThresholdQuestions', 'runId', 'category',
    'TFIDFScore', 'ResponseTime', 'answer_correctness',
    'faithfulness', 'answer_similarity', 'answer_relevancy', 'context_precision',
    'context_relevancy', 'context_recall', 'response_json'
]

# Read the CSV file into a DataFrame
df = pd.read_csv(CSV_FILE, usecols=fieldnames)

# Ensure numeric columns are indeed numeric
numeric_cols = [
    'TFIDFScore', 'ResponseTime', 'answer_similarity', 'answer_correctness'
]

# Convert the specified columns to numeric, coercing errors to NaN
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values in numeric columns
df = df.dropna(subset=numeric_cols)

# Filter out the section 'General'
df_filtered = df[df['section'] != 'General']

# Replace 'assistantAPI' with '1000_20_3' in chunking_detail
df_filtered['chunking_detail'] = df_filtered['chunking_detail'].replace('assistantAPI', '1000_20_3')

# Calculate mean, standard deviation, and RMSE for the specified metrics
df_filtered['TFIDFScore_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['TFIDFScore'].transform('std')
df_filtered['answer_similarity_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_similarity'].transform('std')
df_filtered['answer_correctness_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_correctness'].transform('std')

df_filtered['TFIDFScore_rmse'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['TFIDFScore'].transform(lambda x: np.sqrt(np.mean((x - x.mean())**2)))
df_filtered['answer_similarity_rmse'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_similarity'].transform(lambda x: np.sqrt(np.mean((x - x.mean())**2)))
df_filtered['answer_correctness_rmse'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['answer_correctness'].transform(lambda x: np.sqrt(np.mean((x - x.mean())**2)))

# Group by chunking_detail to compare "no_RAG" vs. the rest
comparison_df = df_filtered.groupby(['chunking_detail'])[numeric_cols + [
    'TFIDFScore_std', 'answer_similarity_std', 'answer_correctness_std',
    'TFIDFScore_rmse', 'answer_similarity_rmse', 'answer_correctness_rmse'
]].mean().reset_index()

# Calculate weighted score and average distribution of the three metrics
comparison_df['AverageScore'] = comparison_df.apply(
    lambda row: (row['TFIDFScore'] + row['answer_similarity'] + row['answer_correctness']) / 3, axis=1
)
comparison_df['AverageStd'] = comparison_df[['TFIDFScore_std', 'answer_similarity_std', 'answer_correctness_std']].mean(axis=1)
comparison_df['AverageRMSE'] = comparison_df[['TFIDFScore_rmse', 'answer_similarity_rmse', 'answer_correctness_rmse']].mean(axis=1)
print(comparison_df)


  chunking_detail  TFIDFScore  ResponseTime  answer_similarity  \
0       1000_20_3    0.421082   3907.945034           0.719911   
1          no_RAG    0.276106   2133.931742           0.639760   

   answer_correctness  TFIDFScore_std  answer_similarity_std  \
0            0.525399        0.177606               0.150208   
1            0.320204        0.164997               0.179456   

   answer_correctness_std  TFIDFScore_rmse  answer_similarity_rmse  \
0                0.216941         0.176288                0.149050   
1                0.191732         0.163137                0.177434   

   answer_correctness_rmse  AverageScore  AverageStd  AverageRMSE  
0                 0.215302      0.555464    0.181585     0.180213  
1                 0.189572      0.412023    0.178728     0.176714  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['chunking_detail'] = df_filtered['chunking_detail'].replace('assistantAPI', '1000_20_3')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['TFIDFScore_std'] = df_filtered.groupby(['ModelName', 'chunking_detail'])['TFIDFScore'].transform('std')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

In [42]:
import pandas as pd
import numpy as np
import json 

CSV_FILE = 'test_results_july3.csv'

fieldnames = [
    'section', 'dataset', 'embedder', 'chunking_detail', 'timestamp', 'extra_info', 'device', 'evalmodel', 'Prompt', 'ModelName',
    'Temperature', 'TopK', 'SimilarityThresholdDocuments',
    'SimilarityThresholdQuestions', 'runId', 'category',
    'TFIDFScore', 'ResponseTime', 'answer_correctness',
    'faithfulness', 'answer_similarity', 'answer_relevancy', 'context_precision',
    'context_relevancy', 'context_recall', 'response_json'
]

# Read the CSV file into a DataFrame
df = pd.read_csv(CSV_FILE, usecols=fieldnames)

# Ensure numeric columns are indeed numeric
numeric_cols = [
    'TFIDFScore', 'ResponseTime', 'answer_similarity', 'answer_correctness'
]

# Convert the specified columns to numeric, coercing errors to NaN
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Parse the 'response_json' to extract the 'answer' field
def extract_answer(json_str):
    try:
        response = json.loads(json_str)
        return response.get('answer', '')
    except json.JSONDecodeError:
        return ''

# Create a new column with the extracted answers
df['extracted_answer'] = df['response_json'].apply(extract_answer)

# Filter the DataFrame for 'General' section
df_filtered = df[(df['chunking_detail'] != 'no_RAG')]

# Group by ModelName and chunking_detail and calculate mean scores before setting to 0
before_scores = df_filtered.groupby(['ModelName', 'chunking_detail'])[numeric_cols].mean().reset_index()

# Count the number of "I don't know" responses
df_filtered['idk_count'] = df_filtered['extracted_answer'].str.contains("I don't know", case=False).astype(int)
idk_count_df = df_filtered.groupby(['ModelName', 'chunking_detail'])['idk_count'].sum().reset_index()

# Set scores to 0 if the answer contains "I don't know"
condition = df_filtered['extracted_answer'].str.contains("I don't know", case=False)
df_filtered.loc[condition, numeric_cols] = 0

# Drop rows with NaN values in numeric columns to avoid issues during aggregation
df_filtered = df_filtered.dropna(subset=numeric_cols)

# Group by ModelName and chunking_detail and calculate mean scores after setting to 0
after_scores = df_filtered.groupby(['ModelName', 'chunking_detail'])[numeric_cols].mean().reset_index()

# Calculate weighted score for before and after
before_scores['WeightedScore'] = before_scores.apply(
    lambda row: (row['TFIDFScore'] + row['answer_similarity'] + row['answer_correctness']) / 3, axis=1
)
after_scores['WeightedScore'] = after_scores.apply(
    lambda row: (row['TFIDFScore'] + row['answer_similarity'] + row['answer_correctness']) / 3, axis=1
)

# Merge idk_count with before and after scores
comparison_df = before_scores.merge(after_scores, on=['ModelName', 'chunking_detail'], suffixes=('_before', '_after'))
comparison_df = comparison_df.merge(idk_count_df, on=['ModelName', 'chunking_detail'])

# Display the final DataFrame
print(comparison_df)

# The resulting DataFrame will have the counts of "I don't know" responses and the before and after scores


            ModelName chunking_detail  TFIDFScore_before  ResponseTime_before  \
0           gemma2:9b       1000_20_3           0.469152          1734.553026   
1  gpt-3.5-turbo-0125       1000_20_3           0.493496          1580.045041   
2  gpt-3.5-turbo-0125    assistantAPI           0.501313          4785.197028   
3              gpt-4o       1000_20_3           0.507940          3475.761402   
4         gpt-4o-mini       1000_20_3           0.532322          2044.048836   
5          llama3:70b       1000_20_3           0.466509          5813.427644   
6           llama3:8b       1000_20_3           0.440145          1757.937967   
7            phi3:14b       1000_20_3           0.456959          3155.549485   
8           phi3:3.8b       1000_20_3           0.431111          1807.065723   

   answer_similarity_before  answer_correctness_before  WeightedScore_before  \
0                  0.781595                   0.574824              0.608524   
1                  0.830967  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['idk_count'] = df_filtered['extracted_answer'].str.contains("I don't know", case=False).astype(int)
