In [29]:
import pickle
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

In [31]:
data_path = 'C:/Users/ASUS/TA01/02_text_preprocessing/02_pickle/02_clean_data.pickle'
with open(data_path, 'rb') as data_path:
    data = pickle.load(data_path)

data

Unnamed: 0,clean_keluhan,bagian
0,baamohon bantu rubah satatus akademik an rezza...,BAA
1,baa telkom university salah alumni lihat data ...,BAA
2,haidar komplain bayar bank bni bank milik tera...,BAA
3,permisi komplain biaya didik lunas status biay...,BAA
4,wbmaaf cancel ksm dosen wali salah input mk am...,BAA
...,...,...
788,fakultas ilmu terap liburtanggal merah weekend...,laboran
789,gaji asisten fri banding dibandingin fakultas,laboran
790,mangganggu yusrin lab magics fte lampu toilet ...,laboran
791,olyvia fransiska teknik telekomunikasi keluh u...,laboran


In [46]:
# Label Encoding
category_codes = {
    'BAA': 0,
    'Pusat Bahasa': 1,
    'Riset dan Layanan Teknologi Informasi': 2,
    'Bagian Logistik dan Aset': 3,
    'Pengembangan Karakter dan Kegiatan Asrama': 4,
    'pengelolaan kegiatan dan kesejahteraan mahasiswa': 5,
    'laboran': 6
}

data['bagian_label'] = LabelEncoder().fit_transform(data['bagian'])
data

Unnamed: 0,clean_keluhan,bagian,bagian_label
0,baamohon bantu rubah satatus akademik an rezza...,BAA,0
1,baa telkom university salah alumni lihat data ...,BAA,0
2,haidar komplain bayar bank bni bank milik tera...,BAA,0
3,permisi komplain biaya didik lunas status biay...,BAA,0
4,wbmaaf cancel ksm dosen wali salah input mk am...,BAA,0
...,...,...,...
788,fakultas ilmu terap liburtanggal merah weekend...,laboran,5
789,gaji asisten fri banding dibandingin fakultas,laboran,5
790,mangganggu yusrin lab magics fte lampu toilet ...,laboran,5
791,olyvia fransiska teknik telekomunikasi keluh u...,laboran,5


In [34]:
# Split train-test data
X_train, X_test, y_train, y_test = train_test_split(data['clean_keluhan'],
                                                    data['bagian_label'], 
                                                    test_size=0.25, 
                                                    random_state=8)

In [35]:
# Text representation: TF-IDF
# TF-IDF parameter
ngram_range = (1,2)
min_df = 1
max_df = 1.0
max_features = 300

tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

(594, 300)
(199, 300)


In [36]:
for Product, category_id in sorted(category_codes.items()):
    features_chi2 = chi2(features_train, labels_train == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(Product))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

# 'BAA' category:
  . Most correlated unigrams:
. registrasi
. menu
. cetak
. ijazah
. wisuda
  . Most correlated bigrams:
. laku registrasi
. bapakibu admin

# 'Bagian Logistik dan Aset' category:
  . Most correlated unigrams:
. skor
. pusat
. bahasa
. test
. eprt
  . Most correlated bigrams:
. test eprt
. pusat bahasa

# 'Pengembangan Karakter dan Kegiatan Asrama' category:
  . Most correlated unigrams:
. email
. office
. login
. akun
. tune
  . Most correlated bigrams:
. username password
. mata kuliah

# 'Pusat Bahasa' category:
  . Most correlated unigrams:
. logistik
. aset
. parkir
. ku
. gku
  . Most correlated bigrams:
. bayar bpp
. logistik aset

# 'Riset dan Layanan Teknologi Informasi' category:
  . Most correlated unigrams:
. gedung
. kotor
. kamar
. air
. asrama
  . Most correlated bigrams:
. mata kuliah
. asrama gedung

# 'laboran' category:
  . Most correlated unigrams:
. usang
. prestasi
. ipk
. input
. beasiswa
  . Most correlated bigrams:
. beasiswa jpu
. informasi b