In [35]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy as sp
from sklearn.linear_model import Ridge
import numpy as np

In [32]:
"""
Data loading
"""
# Both CSV files are read from the current working directory:
# the training set goes to `df`, the objects to predict go to `df_pred`.
TRAIN_CSV, TEST_CSV = 'salary-train.csv', 'salary-test-mini.csv'
df, df_pred = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

# Preview the first training rows as the cell output.
df.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [33]:
"""
Data preparing
"""
# Columns used to build the feature matrices.
CATEGORICAL_COLUMNS = ['LocationNormalized', 'ContractTime']
TEXT_COLUMN = 'FullDescription'


def clean_text(texts: pd.Series) -> pd.Series:
    """Lowercase the texts and replace each non-alphanumeric character with a space.

    Missing values are turned into empty strings first, so every element of
    the returned series is a real string.

    Parameters
    ----------
    texts : pd.Series
        Raw text values (may contain missing entries).

    Returns
    -------
    pd.Series
        Cleaned texts, same index as the input.
    """
    return (texts.fillna('')
            .str.lower()
            .str.replace('[^a-zA-Z0-9]', ' ', regex=True))


# Replace missing values in the categorical columns with the literal 'nan'
# category and clean the description text, for both train and test frames.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection (df[col]) operates on an intermediate object, which pandas
# warns about and which does not update the frame under Copy-on-Write.
for frame in (df, df_pred):
    for column in CATEGORICAL_COLUMNS:
        frame[column] = frame[column].fillna('nan')
    frame[TEXT_COLUMN] = clean_text(frame[TEXT_COLUMN])

# One-hot encoding of LocationNormalized and ContractTime.
# The vectorizer is fitted on the training frame only; the test frame is
# transformed with the already fitted mapping.
vectorizer = DictVectorizer()
X_loc_contr = vectorizer.fit_transform(
    df[CATEGORICAL_COLUMNS].to_dict('records'))
X_loc_contr_pred = vectorizer.transform(
    df_pred[CATEGORICAL_COLUMNS].to_dict('records'))

# TF-IDF features of FullDescription (fitted on train, applied to test).
text_vectorizer = TfidfVectorizer(min_df=5)
X_full_description = text_vectorizer.fit_transform(df[TEXT_COLUMN])
X_full_description_pred = text_vectorizer.transform(df_pred[TEXT_COLUMN])

# Combination of FullDescription and X_loc_contr:
# text features first, categorical features after them.
data = sp.sparse.hstack([X_full_description, X_loc_contr])
data_pred = sp.sparse.hstack([X_full_description_pred, X_loc_contr_pred])

# Sanity checks: one row per training object, identical feature space for
# the training and prediction matrices.
assert data.shape[0] == len(df)
assert data.shape[1] == data_pred.shape[1]

# Number of training rows.
print(data.shape[0])

60000


In [30]:
"""
Regressor preparing
"""
# Target values: the SalaryNormalized column of the training frame.
y_train = df['SalaryNormalized']

# Ridge regression with alpha=1.0 and a fixed random_state, fitted on the
# combined feature matrix. fit() returns the estimator itself, so the fitted
# model is bound to `rgr` and shown as the cell output.
rgr = Ridge(alpha=1.0, random_state=241).fit(data, y_train)
rgr

Ridge(random_state=241)

In [36]:
"""
Prediction making
"""
# Predict for the test matrix and round every prediction
# to the nearest whole number.
rounded_predictions = np.round(rgr.predict(data_pred))

# Text form of each rounded prediction, in the original row order.
ans = list(map(str, rounded_predictions))

# Store the answers in a single line, separated by spaces.
answer_line = " ".join(ans)
with open("w4s1.txt", "w") as out_file:
    out_file.write(answer_line)