In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the two hand-labelled phrase sets.
with_alex = pd.read_csv('with_alex.csv')
without_alex = pd.read_csv('without_alex.csv')

# Drop the columns that are not used in the rest of this analysis.
with_alex = with_alex.drop(columns=['matched_keywords', 'yes_no_Alexandra'])
without_alex = without_alex.drop(columns=['matched_keywords'])

# A missing label is treated as a negative (0).
# The filled column is assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that chained form acts on an
# intermediate object, raises a FutureWarning today and stops modifying the
# frame under pandas 3.0 / copy-on-write -- in which case the dropna() below
# would silently discard every unlabelled row.
with_alex['yes_no_Yilang'] = with_alex['yes_no_Yilang'].fillna(0)
without_alex['yes_no_Yilang'] = without_alex['yes_no_Yilang'].fillna(0)

# Remove rows that still have a missing value in any remaining column.
with_alex = with_alex.dropna()
without_alex = without_alex.dropna()

print(with_alex.shape, without_alex.shape)

# Hold out the first 50 positively labelled rows of without_alex as the
# reference ("train") phrases; they are removed from the frame that is
# evaluated later so they are not scored against themselves.
is_positive = without_alex['yes_no_Yilang'] == 1
is_negative = without_alex['yes_no_Yilang'] == 0
without_alex_train = without_alex[is_positive][:50]

without_alex = pd.concat([without_alex[is_positive][50:], without_alex[is_negative]])
print("New without alex trimmed: ", without_alex.shape)

train_phrases = list(without_alex_train['phrase'])

# Keep only the columns the two frames have in common so they can be stacked.
common_columns = with_alex.columns.intersection(without_alex.columns)
with_alex = with_alex[common_columns]
without_alex = without_alex[common_columns]

print("end=", with_alex.shape, without_alex.shape)

# Stack both sets into a single frame.
# NOTE: the original row labels are kept, so index values may repeat.
joined_df = pd.concat([with_alex, without_alex])
print("Joined df shape: ", joined_df.shape)

# Report, then drop, any rows that still contain NaN values.
nan_rows = joined_df[joined_df.isna().any(axis=1)]
print("Rows with NaN values before dropping:")
print(nan_rows)
joined_df = joined_df.dropna()

print("Joined df shape: ", joined_df.shape)

(464, 3) (1394, 3)
New without alex trimmed:  (1344, 3)
end= (464, 3) (1344, 3)
Joined df shape:  (1808, 3)
Rows with NaN values before dropping:
Empty DataFrame
Columns: [phrase, yes_no_Yilang, candidate_number]
Index: []
Joined df shape:  (1808, 3)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  with_alex['yes_no_Yilang'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  without_alex['yes_no_Yilang'].fillna(0, inplace=True)


In [3]:
# Confirm the (rows, columns) shape of the combined frame.
joined_shape = joined_df.shape
print("Joined df: ", joined_shape)

Joined df:  (1808, 3)


In [4]:
# Inspect the held-out reference phrases: the 'phrase' values of (up to) the
# first 50 rows of without_alex labelled yes_no_Yilang == 1, collected into a
# list in the data-preparation cell.
train_phrases

['we use components manufactured by other american companies whenever possible. we will continue to strive to support the american worker.',
 'on our trade agreements with other tions to receive their products at a lower cost. we need to stop our',
 'moccasins. we will not indicate on customs paperwork that the moccasins are a “gift” to avoid tariffs. intertiol shipments will',
 'america’ interview family run. american made…in the age of outsourcing to the lowest bidder, excel dryer remains committed to the',
 'run. american made…in the age of outsourcing to the lowest bidder, excel dryer remains committed to the great tradition of',
 "the highest standardsour safety shouldn't be outsourced. we're here to change that. united states mask manufactures ppe using only the",
 'own commercial kitchen. nothing is outsourced. this gives us unparalleled control over quality, starting at the farms and fisheries we',
 'a meeting anyone can outsource to suppliers. we do it all in-house.#inhousedes

In [14]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used for every phrase encoded in this cell.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the reference (train) phrases, then every phrase in joined_df.
train_phrases_embeddings = model.encode(train_phrases, convert_to_tensor=True)
joined_df_phrases = joined_df['phrase'].dropna().tolist()
print("Number of phrases: ", len(joined_df_phrases))
joined_df_embeddings = model.encode(joined_df_phrases, convert_to_tensor=True)

# Pairwise cosine similarities: one row per train phrase, one column per
# joined_df phrase.
similarity_scores = util.pytorch_cos_sim(train_phrases_embeddings, joined_df_embeddings)

# Reduce over the train-phrase axis (dim=0), which leaves one value per
# joined_df phrase: its best match and its mean similarity to the train set.
max_similarity_scores = similarity_scores.max(dim=0).values
avg_similarity_scores = similarity_scores.mean(dim=0)

# Mean of the train embeddings, compared against every joined_df phrase.
# This differs from avg_similarity_scores above, which averages the
# similarities rather than the embeddings.
average_train_embedding = train_phrases_embeddings.mean(dim=0)
print("Average train embedding created")

average_similarity_scores = util.pytorch_cos_sim(average_train_embedding, joined_df_embeddings)
print("average_similarity_scores: ", average_similarity_scores.shape)

# Write the three score vectors into joined_df as new columns (assigned
# positionally from NumPy arrays; the single-row result is unwrapped with [0]).
for column_name, column_scores in (
    ('max_similarity', max_similarity_scores),
    ('average_similarity', avg_similarity_scores),
    ('average_joint_similarity', average_similarity_scores[0]),
):
    joined_df[column_name] = column_scores.cpu().numpy()

print(joined_df.head())


Number of phrases:  1808
Average train embedding created
average_similarity_scores:  torch.Size([1, 1808])
                                              phrase  yes_no_Yilang  \
0  in our small workshop in pennsylvania. made in...            1.0   
1  by experienced american bedsmiths. the quality...            1.0   
2  supporting american manufacturing. *soft-side ...            1.0   
3  product safety commission are from chinese imp...            1.0   
4  our southern california factory. never settle ...            1.0   

   candidate_number  max_similarity  average_similarity  \
0                32        0.595803            0.355206   
1               199        0.660558            0.384834   
2               213        0.590401            0.379949   
3               399        0.620829            0.406283   
4               506        0.666501            0.406405   

   average_joint_similarity  
0                  0.561063  
1                  0.607861  
2                  0.

In [15]:
# Number of rows in joined_df whose yes_no_Yilang label equals 1.
count_yes_no_yilang_1 = int((joined_df['yes_no_Yilang'] == 1).sum())
print(count_yes_no_yilang_1)

145


In [16]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Similarity cut-offs to sweep: 0.3, 0.4, ..., 0.9 (seven values).
# np.linspace is used instead of np.arange(0.3, 0.9, 0.1): with a fractional
# step the length of arange depends on floating-point rounding (it only
# included the nominally excluded 0.9 by accident), so the endpoint and the
# number of steps are stated explicitly and rounded to clean decimals.
thresholds = np.round(np.linspace(0.3, 0.9, 7), 1)
columns_to_evaluate = ['max_similarity', 'average_joint_similarity', 'average_similarity']

# The ground-truth labels do not depend on the column or the threshold, so
# they are computed once instead of on every inner iteration.
true_labels = joined_df['yes_no_Yilang'].fillna(0).astype(int)

for column in columns_to_evaluate:
    print(f"Evaluating column: {column}")

    # Metrics for the current column, one entry per threshold.
    precision_values = []
    recall_values = []
    accuracy_values = []

    for threshold in thresholds:
        # A phrase is predicted positive when its score reaches the threshold.
        predicted_labels = (joined_df[column] >= threshold).astype(int)

        # zero_division=0 keeps the reported value at 0 when no row is
        # predicted (or labelled) positive, without the UndefinedMetricWarning
        # that otherwise floods the cell output at high thresholds.
        precision = precision_score(true_labels, predicted_labels, zero_division=0)
        recall = recall_score(true_labels, predicted_labels, zero_division=0)
        accuracy = accuracy_score(true_labels, predicted_labels)

        precision_values.append(precision)
        recall_values.append(recall)
        accuracy_values.append(accuracy)

    for threshold, precision, recall, accuracy in zip(thresholds, precision_values, recall_values, accuracy_values):
        print(f"Threshold: {threshold:.1f}, Precision: {precision:.2f}, Recall: {recall:.2f}, Accuracy: {accuracy:.2f}")

Evaluating column: max_similarity
Threshold: 0.3, Precision: 0.08, Recall: 1.00, Accuracy: 0.09
Threshold: 0.4, Precision: 0.08, Recall: 1.00, Accuracy: 0.12
Threshold: 0.5, Precision: 0.10, Recall: 0.90, Accuracy: 0.35
Threshold: 0.6, Precision: 0.16, Recall: 0.44, Accuracy: 0.77
Threshold: 0.7, Precision: 0.17, Recall: 0.05, Accuracy: 0.90
Threshold: 0.8, Precision: 0.38, Recall: 0.02, Accuracy: 0.92
Threshold: 0.9, Precision: 0.40, Recall: 0.01, Accuracy: 0.92
Evaluating column: average_joint_similarity
Threshold: 0.3, Precision: 0.08, Recall: 1.00, Accuracy: 0.10
Threshold: 0.4, Precision: 0.09, Recall: 1.00, Accuracy: 0.15
Threshold: 0.5, Precision: 0.10, Recall: 0.90, Accuracy: 0.36
Threshold: 0.6, Precision: 0.15, Recall: 0.63, Accuracy: 0.68
Threshold: 0.7, Precision: 0.21, Recall: 0.16, Accuracy: 0.88
Threshold: 0.8, Precision: 0.00, Recall: 0.00, Accuracy: 0.92
Threshold: 0.9, Precision: 0.00, Recall: 0.00, Accuracy: 0.92
Evaluating column: average_similarity
Threshold: 0.3, 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
