# 1. load data

In [2]:
import joblib as jl

text_train = jl.load('text_train.jl')
text_test = jl.load('text_test.jl')
label_train = jl.load('label_train.jl')
label_test = jl.load('label_test.jl')

# 2. check data

In [7]:
print( type(text_train), len(text_train), type(label_train), len(label_train) )

<class 'list'> 90 <class 'list'> 90


In [8]:
print( type(text_test), len(text_test), type(label_test), len(label_test) )

<class 'list'> 18 <class 'list'> 18


In [11]:
label_train# 0 is good, 1 is gibber

[0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1]

In [12]:
text_train[:5]

['I missed out on a delivery day when I clicked check in and waited for my turn to get an order only to find out that not only did my check in not register but the gps showed me down the street. I encountered this issue again when one of the warehouse employees placed an order for that location and the app wanted me to drive in a big circle to get back to where I was standing.',
 "I've arrived at the pick up restaurant but the staff did not have the barecode for me to scan, however I pick up the package and deliver but my is still not let me move on",
 'ews ri',
 "it doesn't work sometimes.",
 'Suggestion: easier way to sign in due alleviate the tediousness of periodically having to sign back in to the app to check for blocks.']

# 3. feature engineering

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(text_train)
y_train = np.array(label_train)

x_test = vectorizer.transform(text_test)
y_test = np.array(label_test)

In [55]:
# 词汇表
vectorizer.vocabulary_ 

{'10': 0,
 '1200': 1,
 '1203': 2,
 '15': 3,
 '150': 4,
 '15107': 5,
 '1pm': 6,
 '20l': 7,
 '25': 8,
 '2s': 9,
 '2w222': 10,
 '33rd': 11,
 '40': 12,
 '5am': 13,
 '7k9opu': 14,
 'aaq': 15,
 'aawfhg': 16,
 'ability': 17,
 'able': 18,
 'about': 19,
 'acceptable': 20,
 'access': 21,
 'accurately': 22,
 'additional': 23,
 'advance': 24,
 'af0000': 25,
 'after': 26,
 'ag0003006033sdgcj12344': 27,
 'again': 28,
 'ages': 29,
 'ahead': 30,
 'al001234': 31,
 'alerts': 32,
 'all': 33,
 'alleviate': 34,
 'allow': 35,
 'am': 36,
 'an': 37,
 'and': 38,
 'android': 39,
 'any': 40,
 'app': 41,
 'appointments': 42,
 'are': 43,
 'arent': 44,
 'arrived': 45,
 'ask': 46,
 'asks': 47,
 'ass': 48,
 'assign': 49,
 'assigned': 50,
 'at': 51,
 'availability': 52,
 'available': 53,
 'awa': 54,
 'awesome': 55,
 'azc': 56,
 'back': 57,
 'background': 58,
 'barecode': 59,
 'bbb': 60,
 'bd': 61,
 'be': 62,
 'bee': 63,
 'been': 64,
 'being': 65,
 'bellevue': 66,
 'benefits': 67,
 'best': 68,
 'better': 69,
 'big': 70

In [19]:
x_train.shape

(90, 523)

In [20]:
y_train.shape

(90,)

In [21]:
x_test.shape

(18, 523)

In [22]:
y_test.shape

(18,)

# 4. model training & predict & evaluation

## 4.1 linear model

In [25]:
from sklearn import metrics

In [28]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_true = y_test
print( metrics.classification_report(y_true, y_pred) )

             precision    recall  f1-score   support

          0       1.00      0.17      0.29        12
          1       0.38      1.00      0.55         6

avg / total       0.79      0.44      0.37        18



## 4.2 decision tree

In [47]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(criterion='gini', max_depth=10, random_state=88)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_true = y_test
print( metrics.classification_report(y_true, y_pred) )

             precision    recall  f1-score   support

          0       1.00      0.42      0.59        12
          1       0.46      1.00      0.63         6

avg / total       0.82      0.61      0.60        18



## 4.3 svm

In [54]:
from sklearn.svm import SVC

model = SVC(C=0.5, kernel='poly', degree=30)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_true = y_test
print( metrics.classification_report(y_true, y_pred) )

             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.33      1.00      0.50         6

avg / total       0.11      0.33      0.17        18



  'precision', 'predicted', average, warn_for)
