# Logistic Regression Classifier

### load data

In [6]:
import os
import pandas as pd
# Directory that holds train_data.csv and test_data.csv.
# Set the TEXT_CLASSIFIER_DATA_DIR environment variable to point the notebook
# at a different location; when it is unset, the path below is used.
data_dir = os.environ.get(
    "TEXT_CLASSIFIER_DATA_DIR",
    "/home/yick/Projects/github.com/text-classifier/data",
)
train_file = os.path.join(data_dir, "train_data.csv")
test_file = os.path.join(data_dir, "test_data.csv")

# Later cells read the "text" and "label" columns from both frames.
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)
print(f"train_df shape: {train_df.shape}")
print(f"test_df shape: {test_df.shape}")

train_df shape: (8718, 2)
test_df shape: (741, 2)


### make label encoder

In [7]:
from sklearn.preprocessing import LabelEncoder
# Learn the set of label strings from the training split, then encode every
# training label as its integer class id.
label_encoder = LabelEncoder().fit(train_df["label"].tolist())
labels = label_encoder.transform(train_df["label"].tolist())
# classes_ is the array of distinct labels seen during fit.
num_labels = label_encoder.classes_.shape[0]

### feature extraction

In [8]:
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
def _jieba_tokenize(text):
    """Segment ``text`` with jieba and return the tokens as a list."""
    return list(jieba.cut(text))


# Term counts over jieba tokens, using unigrams and bigrams.
count_transformer = CountVectorizer(
    tokenizer=_jieba_tokenize,
    # A custom tokenizer bypasses token_pattern entirely, so a regex passed
    # here is never applied. None states that explicitly and avoids
    # scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
    analyzer="word",
    min_df=2,            # ignore terms that appear in fewer than 2 documents
    max_df=0.5,          # ignore terms that appear in more than half the documents
    max_features=20000,  # upper bound on vocabulary size
    ngram_range=(1, 2),
)
tfidf_transformer = TfidfTransformer()

# Fit both stages on the training texts only; the fitted objects are reused
# later to transform the test texts.
count_vector = count_transformer.fit_transform(train_df["text"].tolist())
tfidf_vector = tfidf_transformer.fit_transform(count_vector)
print(f"tfidf_vector shape: {tfidf_vector.shape}")

tfidf_vector shape: (8718, 5762)


### model train

In [9]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier; keyword arguments are grouped by role.
classifier = LogisticRegression(
    # optimiser settings
    solver="liblinear",
    dual=False,
    max_iter=100,
    # inverse regularisation strength: larger C means weaker regularisation
    C=15,
    # weight classes inversely to their frequency in the training labels
    class_weight="balanced",
    # no solver logging
    verbose=0,
)
# Train on the TF-IDF matrix and encoded labels. Keeping this call as the
# cell's last expression displays the fitted estimator's repr as cell output.
classifier.fit(X=tfidf_vector, y=labels)

LogisticRegression(C=15, class_weight='balanced', solver='liblinear')

### model test

In [10]:
from sklearn.metrics import classification_report

# Use transform (not fit_transform) so the test texts are projected onto the
# vocabulary and IDF weights that were fitted on the training split.
test_count_vector = count_transformer.transform(test_df["text"].tolist())
test_tfidf_vector = tfidf_transformer.transform(test_count_vector)
preds = classifier.predict(test_tfidf_vector)

# Convert predicted integer class ids back to label strings so they can be
# compared directly with the raw labels from the test file.
pred_labels = label_encoder.inverse_transform(preds)
true_labels = test_df["label"].tolist()

# Some classes are never predicted, or never occur in the test split, so their
# precision or recall is undefined. zero_division=0 reports those scores as 0
# explicitly instead of emitting an UndefinedMetricWarning for each of them.
report = classification_report(
    true_labels,
    pred_labels,
    digits=4,
    zero_division=0,
)
print(report)

                precision    recall  f1-score   support

           上征信     1.0000    1.0000    1.0000         1
           不专业     0.0000    0.0000    0.0000         0
           不舒服     0.0000    0.0000    0.0000         0
         之前被拒了     0.0000    0.0000    0.0000         1
          人工服务     1.0000    0.4286    0.6000         7
          什么平台     0.8857    1.0000    0.9394        31
       会不会放款失败     0.0000    0.0000    0.0000         4
           利息高     1.0000    0.1667    0.2857         6
        号码是哪来的     0.0000    0.0000    0.0000         0
         否定/拒绝     0.7952    0.7857    0.7904        84
       咨询APP名字     0.6250    1.0000    0.7692         5
       咨询利息/费用     0.8857    0.8378    0.8611        37
        咨询提前还款     1.0000    1.0000    1.0000         1
          咨询操作     0.8750    0.4667    0.6087        30
        咨询放款速度     0.3333    0.5000    0.4000         4
       咨询额度-通用     0.7188    0.8519    0.7797        54
     嗯啊哦额/模糊回答     0.5333    0.8889    0.6667  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
