In [None]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Read the farm-ads data set: one ad per line, formatted as
# "<label> <token> <token> ...".
# The encoding is given explicitly so that reading does not depend on the
# platform's default text encoding.
with open("farm-ads", "r", encoding="utf-8") as f:
    lines = f.readlines()

labels = []
documents = []

for line in lines:
    # Split a single ad into whitespace-separated fields.
    parts = line.strip().split()
    if not parts:
        # Blank lines (e.g. a trailing newline at the end of the file) have
        # no label field; int(parts[0]) would raise IndexError on them.
        continue
    # The leading field is the label: 1 (accepted) or -1 (rejected).
    label = int(parts[0])
    # The remaining fields are the words of the ad.
    tokens = parts[1:]
    labels.append(label)
    documents.append(tokens)

# Train the Word2Vec embedding on the tokenised ads.
w2v_model = Word2Vec(
    sentences=documents,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    epochs=10,
)

# Print the five words whose vectors are most similar to "ad-cow".
print(w2v_model.wv.most_similar("ad-cow", topn=5))

def doc_vector(doc, model):
    """Return the mean word vector of a tokenised document.

    Args:
        doc: Iterable of tokens.
        model: Object exposing ``wv`` (supports ``in`` and lookup by token)
            and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``. If no token is found, a zero vector of length
        ``model.vector_size`` is returned instead.
    """
    embeddings = model.wv
    # Tokens without an embedding are ignored.
    known = [embeddings[token] for token in doc if token in embeddings]
    if known:
        return np.mean(known, axis=0)
    # Nothing to average: fall back to an all-zero vector.
    return np.zeros(model.vector_size)

# Represent every ad by the mean of its word vectors.
doc_vectors = np.array([doc_vector(doc, w2v_model) for doc in documents])

# Hold out 20 % of the ads for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    doc_vectors, labels, test_size=0.2, random_state=42
)

# Seed the forest as well, matching the seeded split above: without it the
# classification report varies from run to run even for identical features.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the held-out ads.
pred = clf.predict(X_test)
print(classification_report(y_test, pred))

def classify_custom_ad(text, model, classifier):
    """Classify a free-text ad as "Accepted" or "Rejected".

    The text is lower-cased and split on whitespace, turned into a single
    vector with ``doc_vector`` and passed to ``classifier.predict`` as a
    one-row sample.

    Args:
        text: The ad as a plain string.
        model: Embedding model forwarded to ``doc_vector``.
        classifier: Fitted estimator providing ``predict``.

    Returns:
        "Accepted" if the predicted label equals 1, otherwise "Rejected".
    """
    words = text.lower().split()
    # predict() expects a 2-D array, so reshape the vector to one row.
    features = doc_vector(words, model).reshape(1, -1)
    predicted_label = classifier.predict(features)[0]
    if predicted_label == 1:
        return "Accepted"
    return "Rejected"

# Reduce the document vectors to two PCA components for plotting.
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(doc_vectors)

# Ads labelled -1 are drawn in red, all others in green.
colors = ['green' if label != -1 else 'red' for label in labels]

plt.figure(figsize=(10, 6))
plt.scatter(
    X_reduced[:, 0],
    X_reduced[:, 1],
    c=colors,
    alpha=0.5,
)
plt.title("Farm Ads - PCA Visualisierung")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()


In [None]:
# Classify a hand-written example ad with the trained embedding and forest.
new_ad = "selling bfh cheap"
result = classify_custom_ad(
    text=new_ad,
    model=w2v_model,
    classifier=clf,
)
print("Prediction:", result)