## Mengambil Kata Kunci dari data ulasan karyawan Shopee

In [None]:
# Install library
%pip install pandas scikit-learn spacy
python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import re

# 1. LOAD THE REVIEWS FROM FILE
# Read both free-text columns, drop missing entries, and stack the pros on top
# of the cons into a single lower-cased Series of review texts.
df = pd.read_csv('shopee_cleaned_reviews.csv')
pros, cons = (
    df[column].dropna().astype(str)
    for column in ('Review Pros', 'Review Cons')
)
all_reviews = pd.concat([pros, cons], ignore_index=True).str.lower()

# 2. CLEAN THE TEXT
# Replace every character that is not a lowercase letter or whitespace with a
# space. Deleting it outright would fuse neighbouring words ("work-life" ->
# "worklife", "company's" -> "companys") and create artificial tokens for the
# TF-IDF step.
all_reviews = all_reviews.apply(lambda text: re.sub(r'[^a-z\s]', ' ', text))

# 3. TF-IDF EXTRACTION
# Score unigrams and bigrams with English stop words removed; max_df=0.8 and
# min_df=5 prune terms by document frequency.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_df=0.8,
    min_df=5,
)
X = vectorizer.fit_transform(all_reviews)

# Sum each term's TF-IDF weight over all reviews and rank terms by that total,
# highest first.
feature_names = vectorizer.get_feature_names_out()
column_totals = X.sum(axis=0).tolist()[0]
tfidf_scores = zip(feature_names, column_totals)
sorted_keywords = sorted(tfidf_scores, key=lambda pair: pair[1], reverse=True)

# 4. KEEP ONLY NOUN / ADJECTIVE TERMS
nlp = spacy.load("en_core_web_sm")
allowed_pos = {'NOUN', 'ADJ'}

# Run all terms through nlp.pipe so spaCy processes them in batches instead of
# one pipeline call per term; the docs come back in the same order as the
# input, so they can be zipped with the (term, score) pairs.
candidate_terms = [term for term, _ in sorted_keywords]
filtered_terms = [
    (term, score)
    for (term, score), doc in zip(sorted_keywords, nlp.pipe(candidate_terms))
    # A term is kept only when every one of its tokens is a noun or adjective.
    if all(token.pos_ in allowed_pos for token in doc)
]

# 5. SAVE THE RESULT
# Write the surviving (term, score) pairs to CSV and report how many were kept.
output_path = 'calon_aspek_dari_shopee.csv'
df_terms = pd.DataFrame(filtered_terms, columns=['term', 'score'])
df_terms.to_csv(output_path, index=False)
print(f"✅ Selesai — {len(df_terms)} calon aspek disimpan di '{output_path}'")


✅ Selesai — 378 calon aspek disimpan di 'calon_aspek_dari_shopee.csv'


### Memastikan hanya kata kunci yang relevan yang dipilih

In [None]:
import pandas as pd
import spacy

# Load spaCy's English model.
nlp = spacy.load("en_core_web_sm")

# Read the extracted candidate terms. Default NA parsing is turned off so that
# a term spelled like a missing-value marker (e.g. "null", "nan") stays a
# string instead of becoming a float NaN, which nlp() cannot process.
df = pd.read_csv("calon_aspek_dari_shopee.csv", keep_default_na=False)

# Function: keep only terms that do NOT contain an adjective
def is_not_adj(term):
    """Return True when spaCy tags no token of ``term`` as an adjective.

    The term is parsed with the module-level ``nlp`` pipeline and every
    token's coarse part-of-speech tag is compared against 'ADJ'.
    """
    return all(token.pos_ != 'ADJ' for token in nlp(term))

# Apply the filter: build a boolean mask over the terms and keep matching rows.
keep_mask = df['term'].apply(is_not_adj)
df_filtered = df[keep_mask].reset_index(drop=True)

# Save the result and report how many terms remain.
output_path = "calon_aspek_noun_only.csv"
df_filtered.to_csv(output_path, index=False)
print(f"✅ Selesai — {len(df_filtered)} term disimpan di {output_path}")


✅ Selesai — 226 term disimpan di calon_aspek_noun_only.csv


## Klasifikasi kata kunci ke dalam aspek (updated ver.)

In [None]:
# ---------- Install jika belum ----------
# pip install pandas sentence-transformers torch

import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch

# ---------- 1. LOAD DATA ----------
df_terms = pd.read_csv("input_term_new.csv")  # make sure this file exists in the working directory
# Drop rows without a term from the frame itself (not only from the list), so
# the per-term results that are assigned back as columns further down have the
# same length as df_terms. Dropping them only from `terms` would make those
# assignments fail with a length mismatch whenever a term is missing.
df_terms = df_terms.dropna(subset=['term']).reset_index(drop=True)
terms = df_terms['term'].astype(str).tolist()

# ---------- 2. ASPECT DEFINITIONS, CATEGORIES & DESCRIPTIONS ----------
# The aspects are listed per category; the combined list keeps the motivators
# first, followed by the hygiene factors.
motivator_aspects = [
    'Achievement',
    'Recognition',
    'Work Itself',
    'Responsibility',
    'Growth',
    'Advancement',
]
hygiene_aspects = [
    'Compensation',
    'Supervision',
    'Work Condition',
    'Interpersonal Relations',
    'Company Policy',
    'Job Security',
    'Personal Life',
]
herzberg_aspects = motivator_aspects + hygiene_aspects

# Map every aspect name to its category label.
aspect_to_category = {
    **dict.fromkeys(motivator_aspects, 'Motivator'),
    **dict.fromkeys(hygiene_aspects, 'Hygiene'),
}

# One-sentence description per aspect; these sentences are what gets embedded
# and compared against the terms.
aspect_descriptions = {
    'Achievement':
        "Feeling a sense of accomplishment and success in completing meaningful work tasks.",
    'Recognition':
        "Receiving praise, appreciation, or acknowledgment for one’s contributions and performance.",
    'Work Itself':
        "The actual tasks and responsibilities of the job being engaging, enjoyable, or fulfilling.",
    'Responsibility':
        "Having autonomy, authority, or being trusted with important duties and decisions.",
    'Growth':
        "Opportunities for personal learning, skill development, and self-improvement through work.",
    'Advancement':
        "Possibility of promotion, career progression, or moving up within the organization.",
    'Compensation':
        "The salary, bonuses, and other financial rewards provided for performing the job.",
    'Supervision':
        "The quality and fairness of guidance, support, and feedback from managers or supervisors.",
    'Work Condition':
        "The physical environment, tools, infrastructure, and overall conditions of the workplace.",
    'Interpersonal Relations':
        "The nature of relationships and social interactions with coworkers, managers, and peers.",
    'Company Policy':
        "The rules, procedures, and administrative practices set by the organization.",
    'Job Security':
        "The perceived stability and continuity of employment within the organization.",
    'Personal Life':
        "The extent to which work allows balance with family time, health, and personal well-being.",
}

# Descriptions in the same order as herzberg_aspects, so an index into one
# list identifies the matching entry in the other.
aspect_sentences = [aspect_descriptions[name] for name in herzberg_aspects]

# ---------- 3. LOAD MODEL AND ENCODE ----------
# The original author notes 'all-mpnet-base-v2' as an alternative model name
# to switch to for more accurate results.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# Encode the terms first, then the aspect descriptions, both as tensors.
term_embeddings, description_embeddings = (
    model.encode(texts, convert_to_tensor=True)
    for texts in (terms, aspect_sentences)
)

# ---------- 4. CLASSIFY USING THE DESCRIPTIONS ----------
# For each term, pick the aspect whose description embedding has the highest
# cosine similarity to the term embedding, and record the aspect name, its
# description and the similarity value.
assigned_aspects, aspect_meanings, similarity_scores = [], [], []

for term_vector in term_embeddings:
    scores = util.cos_sim(term_vector, description_embeddings)[0]
    top = int(torch.argmax(scores))

    assigned_aspects.append(herzberg_aspects[top])
    aspect_meanings.append(aspect_sentences[top])
    similarity_scores.append(scores[top].item())

# ---------- 5. SAVE THE RESULT ----------
# Attach the classification results as new columns; Category is looked up from
# the freshly assigned Aspect column.
df_terms = df_terms.assign(
    Aspect=assigned_aspects,
    Category=lambda frame: frame['Aspect'].map(aspect_to_category),
    Aspect_Description=aspect_meanings,
    Similarity_Score=similarity_scores,
)

output_path = "hasil_klasifikasi_dengan_deskripsi_aspek.csv"
df_terms.to_csv(output_path, index=False)
print(f"✅ Hasil disimpan ke {output_path}")


✅ Hasil disimpan ke hasil_klasifikasi_dengan_deskripsi_aspek.csv
