## Linear Regression

In [1]:
import pandas as pd
from tqdm import tqdm_notebook
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import Ridge

In [2]:
# Read the training set and the small test set from CSV files in the working directory.
data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('salary-train.csv', 'salary-test-mini.csv')
)
# Show the first rows of the training frame as the cell output.
data.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [3]:
# Fill missing categorical values BEFORE any string processing: a missing cell is a
# float NaN, and calling .lower() on it raises AttributeError. The result is assigned
# back to the column instead of calling fillna(..., inplace=True) on the selected
# column, which is chained assignment and may only modify a temporary copy.
data['LocationNormalized'] = data['LocationNormalized'].fillna('nan')
data['ContractTime'] = data['ContractTime'].fillna('nan')

# Lower-case the text and replace every character other than an ASCII letter or digit
# with a single space.
data['FullDescription'] = (
    data['FullDescription'].str.lower().str.replace('[^a-zA-Z0-9]', ' ', regex=True)
)
data['LocationNormalized'] = (
    data['LocationNormalized'].str.lower().str.replace('[^a-zA-Z0-9]', ' ', regex=True)
)

# Regression target taken from the training frame.
y_train = data['SalaryNormalized']

In [4]:
# Fill missing categorical values BEFORE any string processing: a missing cell is a
# float NaN, and calling .lower() on it raises AttributeError. The result is assigned
# back to the column instead of calling fillna(..., inplace=True) on the selected
# column, which is chained assignment and may only modify a temporary copy.
test_data['LocationNormalized'] = test_data['LocationNormalized'].fillna('nan')
test_data['ContractTime'] = test_data['ContractTime'].fillna('nan')

# Lower-case the text and replace every character other than an ASCII letter or digit
# with a single space.
test_data['FullDescription'] = (
    test_data['FullDescription'].str.lower().str.replace('[^a-zA-Z0-9]', ' ', regex=True)
)
test_data['LocationNormalized'] = (
    test_data['LocationNormalized'].str.lower().str.replace('[^a-zA-Z0-9]', ' ', regex=True)
)

In [5]:
# Vectorize the two categorical columns from per-row dicts. The vectorizer is fitted
# on the training rows only and then reused, unchanged, for the test rows.
categorical_columns = ['LocationNormalized', 'ContractTime']
train_records = data[categorical_columns].to_dict('records')
test_records = test_data[categorical_columns].to_dict('records')

dictV = DictVectorizer()
X_train = dictV.fit_transform(train_records)
X_test = dictV.transform(test_records)

In [6]:
# TF-IDF features for the job descriptions. The vocabulary and IDF weights are learned
# from the training descriptions only; the test descriptions are transformed with them.
train_texts = data['FullDescription']
test_texts = test_data['FullDescription']

tfidf = TfidfVectorizer(min_df=5)
train_desc = tfidf.fit_transform(train_texts)
test_desc = tfidf.transform(test_texts)

In [7]:
# Place the text features and the categorical features side by side (text columns
# first), for the training and the test matrix alike.
# NOTE(review): X_train / X_test are rebound from their own previous values, so
# re-running this cell without re-running the categorical-vectorizer cell first
# prepends the text features a second time.
X_train, X_test = hstack((train_desc, X_train)), hstack((test_desc, X_test))

In [9]:
# Fit a ridge regression on the combined training features, then predict the target
# for the test rows.
ridge = Ridge(alpha=1, random_state=241)
ridge.fit(X=X_train, y=y_train)
res = ridge.predict(X=X_test)

In [11]:
# Round each prediction to two decimal places and convert it to text.
# NOTE(review): `res` changes from numeric predictions to a list of strings here, so
# this cell cannot be re-run on its own output.
res = list(map(lambda prediction: str(round(prediction, 2)), res))

In [12]:
# Write the formatted predictions as a single space-separated line.
# The `with` block closes the file even if the write raises, which the previous
# manual open()/close() pair did not guarantee.
with open('linnear-ans.txt', 'w') as f:
    f.write(' '.join(res))