In [50]:
import os
import pickle

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [51]:
# Load the pickled dataset; the rendered output shows a table with
# `clean_keluhan` (text) and `bagian` (category name) columns.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
data_path = 'C:/Users/ASUS/TA01/02_text_preprocessing/02_pickle/02_clean_data.pickle'

# Use a distinct name for the file handle so `data_path` keeps referring to the
# path string instead of being rebound to a (closed) file object.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(data_path, 'rb') as data_file:
    data = pickle.load(data_file)

data

Unnamed: 0,clean_keluhan,bagian
0,baa bantu rubah status akademik an rezza rijki...,BAA
1,baa telkom university salah alumni lihat data ...,BAA
2,haidar komplain bayar bank bni bank milik tera...,BAA
3,permisi komplain biaya didik lunas status biay...,BAA
4,wbmaaf cancel ksm dosen wali salah input mk am...,BAA
...,...,...
788,fakultas ilmu terap liburtanggal merah weekend...,LABORAN
789,gaji asisten fri banding dibandingin fakultas,LABORAN
790,mangganggu yusrin lab magics fte lampu toilet ...,LABORAN
791,olyvia fransiska teknik telekomunikasi keluh u...,LABORAN


In [52]:
# Label encoding
# Explicit category name -> integer code mapping. The names are listed in
# alphabetical order with consecutive codes starting at 0.
category_codes = {
    'BAA': 0,
    'BAGIAN LOGISTIK DAN ASET': 1,
    'LABORAN': 2,
    'PENGELOLAAN KEGIATAN DAN KESEJAHTERAAN MAHASISWA': 3,
    'PENGEMBANGAN KARAKTER DAN KEGIATAN ASRAMA': 4,
    'PUSAT BAHASA': 5,
    'RISET DAN LAYANAN TEKNOLOGI INFORMASI': 6
}

# Encode through the explicit mapping so `bagian_label` always agrees with
# `category_codes`. A freshly fitted LabelEncoder only numbers the classes it
# sees, so its codes would silently shift (and stop matching this dict) if any
# category were absent from the data.
bagian_label = data['bagian'].map(category_codes)

# Fail loudly on category names that the mapping does not cover instead of
# storing NaN labels.
unmapped = data.loc[bagian_label.isna(), 'bagian'].unique()
if len(unmapped) > 0:
    raise ValueError(
        "Values in 'bagian' missing from category_codes: {}".format(list(unmapped))
    )

data['bagian_label'] = bagian_label.astype('int64')
data

Unnamed: 0,clean_keluhan,bagian,bagian_label
0,baa bantu rubah status akademik an rezza rijki...,BAA,0
1,baa telkom university salah alumni lihat data ...,BAA,0
2,haidar komplain bayar bank bni bank milik tera...,BAA,0
3,permisi komplain biaya didik lunas status biay...,BAA,0
4,wbmaaf cancel ksm dosen wali salah input mk am...,BAA,0
...,...,...,...
788,fakultas ilmu terap liburtanggal merah weekend...,LABORAN,2
789,gaji asisten fri banding dibandingin fakultas,LABORAN,2
790,mangganggu yusrin lab magics fte lampu toilet ...,LABORAN,2
791,olyvia fransiska teknik telekomunikasi keluh u...,LABORAN,2


In [53]:
# Split train-test data
# 10% of the rows are held out for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
text_column = data['clean_keluhan']
label_column = data['bagian_label']

X_train, X_test, y_train, y_test = train_test_split(
    text_column,
    label_column,
    test_size=0.10,
    random_state=0,
)

In [54]:
# Text representation: TF-IDF
# TF-IDF parameter
ngram_range = (1,2)
min_df = 1
max_df = 1.0
max_features = 1000

tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

(713, 1000)
(80, 1000)


In [55]:
# For every category, print the five unigrams and five bigrams whose TF-IDF
# features score highest on a chi-squared test against a one-vs-rest indicator
# of that category.

# The vocabulary is the same for every category, so look it up once.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older versions.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for category_name, category_id in sorted(category_codes.items()):
    # chi2 returns (statistics, p-values); only the statistics are used.
    chi2_scores, _p_values = chi2(features_train, labels_train == category_id)

    # Ascending sort: the strongest terms end up at the tail of the list.
    feature_names = all_feature_names[np.argsort(chi2_scores)]

    # Separate the n-grams by their number of space-delimited tokens.
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]

    print("# '{}' category:".format(category_name))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-5:])))
    print("")

# 'BAA' category:
  . Most correlated unigrams:
. cetak
. legalisasi
. registrasi
. ijazah
. wisuda
  . Most correlated bigrams:
. ijazah transkrip
. transkip nilai
. igracias solusi
. laku cetak
. daftar wisuda

# 'BAGIAN LOGISTIK DAN ASET' category:
  . Most correlated unigrams:
. logistik
. aset
. ku
. parkir
. gku
  . Most correlated bigrams:
. ruang kelas
. kipas angin
. toilet gku
. tes cpns
. logistik aset

# 'LABORAN' category:
  . Most correlated unigrams:
. jurnal
. asprak
. praktikan
. lab
. praktikum
  . Most correlated bigrams:
. lampu neon
. fasilitas lab
. komputer lot
. komputer lab
. laksana praktikum

# 'PENGELOLAAN KEGIATAN DAN KESEJAHTERAAN MAHASISWA' category:
  . Most correlated unigrams:
. input
. ipk
. prestasi
. beasiswa
. tak
  . Most correlated bigrams:
. beasiswa semester
. beasiswa jpu
. tak usang
. informasi beasiswa
. input tak

# 'PENGEMBANGAN KARAKTER DAN KEGIATAN ASRAMA' category:
  . Most correlated unigrams:
. banget
. inap
. kamar
. air
. asrama
  .

In [56]:
# Persist the labelled dataset, the raw splits, the TF-IDF matrices with their
# labels, and the fitted vectorizer as individual pickle files.
output_dir = '03_pickle'

# open(..., 'wb') does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(output_dir, exist_ok=True)

# File stem -> object to serialize; each entry is written to
# '<output_dir>/<stem>.pickle'.
artifacts = {
    '03_data': data,
    '03_X_train': X_train,
    '03_X_test': X_test,
    '03_y_train': y_train,
    '03_y_test': y_test,
    '03_features_train': features_train,
    '03_labels_train': labels_train,
    '03_features_test': features_test,
    '03_labels_test': labels_test,
    '03_tfidf': tfidf,
}

for file_stem, artifact in artifacts.items():
    with open('{}/{}.pickle'.format(output_dir, file_stem), 'wb') as output:
        pickle.dump(artifact, output)