In [44]:
import ast

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Settings a reader may want to tune for this experiment.
DATA_PATH = '/content/prototype.csv'
SEED = 42
N_LEADING = 5  # how many leading entries of each embedding are averaged

# Load the dataset. The 'emb' column is parsed from its string form with
# ast.literal_eval, which only evaluates Python literals (never arbitrary code).
df = pd.read_csv(DATA_PATH)
df['emb'] = df['emb'].apply(ast.literal_eval)

# Shuffle the rows with a fixed seed so the run is reproducible.
df = df.sample(frac=1, random_state=SEED)

# Separate features (embeddings) and labels.
X = df['emb'].tolist()
y = df['modified']

# Pool each embedding: average its first N_LEADING entries along axis 0.
# NOTE(review): if 'emb' is a flat list of floats this gives ONE scalar feature
# per row; if it is a list of vectors it gives one pooled vector per row.
X_transformed = [np.mean(emb_vec[:N_LEADING], axis=0) for emb_vec in X]

# Build a 2-D (n_samples, n_features) matrix. Only scalar pools need the extra
# column axis; reshaping vector pools with (-1, 1) would flatten them and break
# the one-row-per-label alignment with y.
X_transformed = np.asarray(X_transformed)
if X_transformed.ndim == 1:
    X_transformed = X_transformed.reshape(-1, 1)

# Split the dataset into training and testing sets (80% / 20%).
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=SEED
)

# Train the logistic regression model.
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out testing set.
y_pred = model.predict(X_test)

# Calculate evaluation metrics on the testing set.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Prototype Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')


Prototype Accuracy: 0.90
Precision: 0.90
Recall: 1.00
F1-score: 0.95


In [50]:
import ast

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Settings a reader may want to tune for this experiment.
DATA_PATH = '/content/nonprototype.csv'
SEED = 42
N_LEADING = 5  # how many leading entries of each embedding are averaged

# Load the dataset. The 'emb' column is parsed from its string form with
# ast.literal_eval, which only evaluates Python literals (never arbitrary code).
df = pd.read_csv(DATA_PATH)
df['emb'] = df['emb'].apply(ast.literal_eval)

# Shuffle the rows with a fixed seed so the run is reproducible.
df = df.sample(frac=1, random_state=SEED)

# Separate features (embeddings) and labels.
X = df['emb'].tolist()
y = df['modified']

# Pool each embedding: average its first N_LEADING entries along axis 0.
# NOTE(review): if 'emb' is a flat list of floats this gives ONE scalar feature
# per row; if it is a list of vectors it gives one pooled vector per row.
X_transformed = [np.mean(emb_vec[:N_LEADING], axis=0) for emb_vec in X]

# Build a 2-D (n_samples, n_features) matrix. Only scalar pools need the extra
# column axis; reshaping vector pools with (-1, 1) would flatten them and break
# the one-row-per-label alignment with y.
X_transformed = np.asarray(X_transformed)
if X_transformed.ndim == 1:
    X_transformed = X_transformed.reshape(-1, 1)

# Split the dataset into training and testing sets (80% / 20%).
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=SEED
)

# Train the logistic regression model.
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out testing set.
y_pred = model.predict(X_test)

# Calculate evaluation metrics on the testing set.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'NONPrototype Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')


NONPrototype Accuracy: 0.90
Precision: 0.90
Recall: 1.00
F1-score: 0.95


In [54]:
import ast

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Settings a reader may want to tune for this experiment.
DATA_PATH = '/content/singleton.csv'
SEED = 42
N_LEADING = 5  # how many leading entries of each embedding are averaged

# Load the dataset. The 'emb' column is parsed from its string form with
# ast.literal_eval, which only evaluates Python literals (never arbitrary code).
df = pd.read_csv(DATA_PATH)
df['emb'] = df['emb'].apply(ast.literal_eval)

# Shuffle the rows with a fixed seed so the run is reproducible.
df = df.sample(frac=1, random_state=SEED)

# Separate features (embeddings) and labels.
X = df['emb'].tolist()
y = df['modified']

# Pool each embedding: average its first N_LEADING entries along axis 0.
# NOTE(review): if 'emb' is a flat list of floats this gives ONE scalar feature
# per row; if it is a list of vectors it gives one pooled vector per row.
X_transformed = [np.mean(emb_vec[:N_LEADING], axis=0) for emb_vec in X]

# Build a 2-D (n_samples, n_features) matrix. Only scalar pools need the extra
# column axis; reshaping vector pools with (-1, 1) would flatten them and break
# the one-row-per-label alignment with y.
X_transformed = np.asarray(X_transformed)
if X_transformed.ndim == 1:
    X_transformed = X_transformed.reshape(-1, 1)

# Split the dataset into training and testing sets (80% / 20%).
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=SEED
)

# Apply SMOTE to the training set only, so the test set keeps its original
# class distribution and contains no synthetic samples.
smote = SMOTE(random_state=SEED)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Scale the feature vectors: statistics are fitted on the (resampled) training
# data and then reused unchanged for the test data.
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Train the logistic regression model on the resampled, scaled training set.
model = LogisticRegression()
model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the held-out testing set.
y_pred = model.predict(X_test)

# Calculate evaluation metrics on the testing set.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Singleton Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')


Singleton Accuracy: 0.50
Precision: 0.50
Recall: 0.60
F1-score: 0.55


In [55]:
import ast

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Settings a reader may want to tune for this experiment.
DATA_PATH = '/content/nonsingleton.csv'
SEED = 42
N_LEADING = 5  # how many leading entries of each embedding are averaged

# Load the dataset. The 'emb' column is parsed from its string form with
# ast.literal_eval, which only evaluates Python literals (never arbitrary code).
df = pd.read_csv(DATA_PATH)
df['emb'] = df['emb'].apply(ast.literal_eval)

# Shuffle the rows with a fixed seed so the run is reproducible.
df = df.sample(frac=1, random_state=SEED)

# Separate features (embeddings) and labels.
X = df['emb'].tolist()
y = df['modified']

# Pool each embedding: average its first N_LEADING entries along axis 0.
# NOTE(review): if 'emb' is a flat list of floats this gives ONE scalar feature
# per row; if it is a list of vectors it gives one pooled vector per row.
X_transformed = [np.mean(emb_vec[:N_LEADING], axis=0) for emb_vec in X]

# Build a 2-D (n_samples, n_features) matrix. Only scalar pools need the extra
# column axis; reshaping vector pools with (-1, 1) would flatten them and break
# the one-row-per-label alignment with y.
X_transformed = np.asarray(X_transformed)
if X_transformed.ndim == 1:
    X_transformed = X_transformed.reshape(-1, 1)

# Split the dataset into training and testing sets (80% / 20%).
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=SEED
)

# Apply SMOTE to the training set only, so the test set keeps its original
# class distribution and contains no synthetic samples.
smote = SMOTE(random_state=SEED)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Scale the feature vectors: statistics are fitted on the (resampled) training
# data and then reused unchanged for the test data.
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Train the logistic regression model on the resampled, scaled training set.
model = LogisticRegression()
model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the held-out testing set.
y_pred = model.predict(X_test)

# Calculate evaluation metrics on the testing set.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'NonSingleton Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')


NonSingleton Accuracy: 0.40
Precision: 0.25
Recall: 0.25
F1-score: 0.25
