In [61]:
import re
import pandas
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import Ridge

# Salary prediction: TF-IDF of the job description plus one-hot encoded
# location and contract type, stacked into one sparse matrix and fed to Ridge.

# Compiled once instead of passing the pattern string to re.sub for every row.
_NON_ALNUM = re.compile(r'[^a-zA-Z0-9]')

# Categorical columns that get one-hot encoded.
_CATEGORICAL_COLUMNS = ['LocationNormalized', 'ContractTime']


def _normalize_text(text):
    """Return *text* lower-cased, each non-alphanumeric char replaced by a space."""
    return _NON_ALNUM.sub(' ', text.lower())


# get test data
data_test = pandas.read_csv('salary-test-mini.csv')

# get train data
data = pandas.read_csv('salary-train.csv')

# convert FullDescription (use TF-IDF) for train; the vectorizer is fitted
# on the training descriptions only
data['FullDescription'] = data['FullDescription'].apply(_normalize_text)
vectorizer = TfidfVectorizer()
FullDescription_TF_IDF = vectorizer.fit_transform(data['FullDescription'])

# convert FullDescription (use TF-IDF) for test, reusing the fitted vectorizer
data_test['FullDescription'] = data_test['FullDescription'].apply(_normalize_text)
FullDescription_TF_IDF_test = vectorizer.transform(data_test['FullDescription'])

# replace empty values with the string 'nan' for train and test.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column is chained assignment, which does not update the parent
# DataFrame when pandas runs with Copy-on-Write.
for _frame in (data, data_test):
    for _column in _CATEGORICAL_COLUMNS:
        _frame[_column] = _frame[_column].fillna('nan')

# use one-hot convert for train
enc = DictVectorizer()
m_train_features = enc.fit_transform(data[_CATEGORICAL_COLUMNS].to_dict('records'))

# use one-hot convert for test (same fitted encoder as for train)
m_test_features = enc.transform(data_test[_CATEGORICAL_COLUMNS].to_dict('records'))

# combine in 1 matrix for train: text features first, then categorical ones
train_features = hstack([FullDescription_TF_IDF, m_train_features])

# combine in 1 matrix for test, in the same column order as for train
test_features = hstack([FullDescription_TF_IDF_test, m_test_features])

# train
clf = Ridge(alpha=1.0)
clf.fit(train_features, data['SalaryNormalized'])

# predict
print(clf.predict(test_features))

[ 56963.64408442  37497.57446373]
