In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pandas as pd
from sklearn.model_selection import StratifiedKFold

In [91]:
# Load each raw corpus as one string. The context managers close the file
# handles as soon as the text has been read instead of leaving them open.
with open('data/apple-computers.txt') as company_file:
    apple_company = company_file.read()
with open('data/apple-fruit.txt') as fruit_file:
    apple_fruit = fruit_file.read()

In [92]:
# Turn each raw corpus string into a list with one entry per newline-separated line.
apple_company, apple_fruit = [
    corpus.split('\n') for corpus in (apple_company, apple_fruit)
]

In [93]:
def clean_texts(texts):
    """Normalize raw text lines before vectorization.

    Empty strings are dropped. Every remaining line is lower-cased, has
    bracketed numeric reference markers such as ``[3]`` removed, and has each
    run of characters other than ASCII letters, digits, space, ``.`` and ``,``
    replaced by a single space.

    Args:
        texts: Iterable of strings, one entry per line.

    Returns:
        list of str: The cleaned lines, in their original order.
    """
    # Raw strings hand the backslashes to the regex engine untouched; in a
    # plain literal "\[" and "\d" are invalid string escapes that Python
    # warns about.
    reference_marker = re.compile(r'\[\d+\]')
    disallowed_run = re.compile(r'[^A-Za-z0-9 .,]+')

    cleaned = []
    for text in texts:
        if text == '':
            continue
        without_refs = reference_marker.sub('', text.lower())
        cleaned.append(disallowed_run.sub(' ', without_refs))
    return cleaned
# Run both corpora through the same normalization.
apple_company, apple_fruit = [
    clean_texts(lines) for lines in (apple_company, apple_fruit)
]

In [94]:
# One row per cleaned line: company lines first, then fruit lines, each
# tagged with the label of the corpus it came from.
texts = apple_company + apple_fruit
labels = ['computer-company'] * len(apple_company) + ['fruit'] * len(apple_fruit)
df = pd.DataFrame(list(zip(texts, labels)), columns=['text', 'label'])
df['label'].value_counts()

computer-company    283
fruit               187
Name: label, dtype: int64

In [95]:
# Five-fold splitter; shuffling uses a fixed seed so the folds are repeatable.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=23,
)

In [96]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

from pycm import *
# Estimate accuracy of a TF-IDF + linear SVM classifier with cross-validation,
# then fit the final `tfidf` and `model` objects that later cells reuse.
X = df['text'].values
y = df['label'].values

# Mistakes on the less frequent 'fruit' class are penalized more heavily.
CLASS_WEIGHT = {'computer-company': 1, 'fruit': 1.5}

fold_scores = []
for train_idx, test_idx in skf.split(X=X, y=y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # The vectorizer is fitted on the training fold only, so vocabulary and
    # IDF weights never see the held-out texts.
    fold_tfidf = TfidfVectorizer(ngram_range=(1, 1))
    fold_model = SVC(C=0.9, class_weight=CLASS_WEIGHT, kernel='linear')
    fold_model.fit(fold_tfidf.fit_transform(X_train), y_train)

    y_pred = fold_model.predict(fold_tfidf.transform(X_test))
    cm = ConfusionMatrix(actual_vector=y_test, predict_vector=y_pred)
    fold_scores.append(cm.ACC_Macro)
    print(cm.ACC_Macro)
    print('------------')

print('mean ACC_Macro: %s' % (sum(fold_scores) / len(fold_scores)))

# Cross-validation only measures performance. The objects used afterwards are
# refit on every labelled example rather than being whatever the last fold
# left behind (which would have seen only 4/5 of the data).
tfidf = TfidfVectorizer(ngram_range=(1, 1))
model = SVC(C=0.9, class_weight=CLASS_WEIGHT, kernel='linear')
model.fit(tfidf.fit_transform(X), y)

0.947368421053
------------
0.915789473684
------------
0.936170212766
------------
0.870967741935
------------
0.903225806452
------------


In [97]:
import pickle
# Persist the fitted classifier together with its vectorizer. Pickle output is
# bytes, so the file must be opened in binary mode (text mode 'w' raises
# TypeError on Python 3); the context manager flushes and closes the handle.
with open('data/model_trainedv1.pic', 'wb') as model_file:
    pickle.dump([model, tfidf], model_file)

In [89]:
# Ad-hoc sanity check on two hand-written sentences.
sample_texts = [
  """
  profit very low in rainy season
  """,
"""
profits very low for samsung
"""
]
# Send the samples through the same cleaning used on the training text so the
# vectorizer sees input normalized the way its vocabulary was built.
# Note: clean_texts drops empty strings, so keep every sample non-empty or the
# predictions will no longer line up one-to-one with sample_texts.
X_test = tfidf.transform(clean_texts(sample_texts))
y_pred = model.predict(X_test)
y_pred

array(['fruit', 'computer-company'], dtype=object)