# Linear models

### Outline <a name = 'outline'></a>
* [Data generation](#data) 
* [Model training](#models) 
* [Text Data Classification](#text)

In [2]:
import matplotlib
from matplotlib import pyplot as plt
matplotlib.pyplot.style.use('ggplot')

from sklearn import datasets, linear_model, metrics, model_selection, pipeline, preprocessing, multiclass
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import pandas as pd
import ast
import string
import re

In [3]:
train = pd.read_csv('train.csv',sep=',')
train['dialogue'] = train['dialogue'].apply(lambda t: re.sub(r"<BR>", " ", t))
train

Unnamed: 0,id,movie,dialogue,genres
0,0,0,I thought you were in a meeting--? I am. Wi...,"[u'drama', u'romance']"
1,1,1,Are you sure you're okay? You're pale. I fe...,[u'drama']
2,2,2,Go on! Get out! Mom look don't say anything....,[u'comedy']
3,3,3,I could have lost my fucking hands. That wou...,"[u'mystery', u'thriller']"
4,4,4,Stick with me on this Gloria. I need you... ...,"[u'crime', u'thriller']"
...,...,...,...,...
36986,36986,246,There's a man downstairs. He brought us eggs....,"[u'drama', u'war']"
36987,36987,43,Hi. I'd prefer it if you didn't speak to me....,"[u'comedy', u'drama']"
36988,36988,459,I tried to call you I'm running a little late ...,[u'drama']
36989,36989,174,What are you crazy? I just thought we should...,"[u'drama', u'romance']"


In [4]:
data = pd.DataFrame(train['dialogue'].str.replace('[^\w\s]', ''))
data = pd.DataFrame(data['dialogue'].str.lower())

data['genres'] = train['genres'].map(lambda t: ast.literal_eval(t))
genres = []
for _ in data['genres']:
    genres.extend(_)
genres = list(set(genres))
data

Unnamed: 0,dialogue,genres
0,i thought you were in a meeting i am with you,"[drama, romance]"
1,are you sure youre okay youre pale i feel l...,[drama]
2,go on get out mom look dont say anything fir...,[comedy]
3,i could have lost my fucking hands that woul...,"[mystery, thriller]"
4,stick with me on this gloria i need you and...,"[crime, thriller]"
...,...,...
36986,theres a man downstairs he brought us eggs h...,"[drama, war]"
36987,hi id prefer it if you didnt speak to me i...,"[comedy, drama]"
36988,i tried to call you im running a little late t...,[drama]
36989,what are you crazy i just thought we should ...,"[drama, romance]"


In [6]:
train_data, test_data, Y, Y_test = model_selection.train_test_split(data.dialogue, data.genres, test_size = 0.1, random_state = 12)
lb = preprocessing.MultiLabelBinarizer(classes = genres)
train_ans = pd.DataFrame(lb.fit_transform(Y), columns = genres)
test_ans = pd.DataFrame(lb.fit_transform(Y_test), columns = genres)

In [7]:
train_data

21767    yeah right  thats where youd go to find girls ...
36292    now thats practically uncut you could if you s...
17844    sorry miss schaefer but ive given up writing o...
13664    i think anybody she ever loves tends to die on...
22224    so its not a meaningful symbol or anything tha...
                               ...                        
12108    i feel giddy   but maude you dont understand i...
9475     hey its the invisible man brother where you be...
36482    pop i think we should sell this place get outt...
19709    i dont know why you didnt tell me about this  ...
14155    i was hoping to perhaps direct some episodes m...
Name: dialogue, Length: 33291, dtype: object

In [9]:
#option 1: count vectorizer
vectorizer = CountVectorizer(min_df = 10, stop_words={'english'})

vectorized_train_data = vectorizer.fit_transform(train_data)
vectorized_test_data = vectorizer.transform(test_data)
print(vectorized_train_data)

  (0, 6006)	1
  (0, 4423)	1
  (0, 5325)	1
  (0, 5866)	1
  (0, 6023)	1
  (0, 2237)	1
  (0, 5428)	1
  (0, 1984)	1
  (0, 2223)	1
  (0, 2460)	1
  (0, 2184)	1
  (0, 721)	1
  (0, 5326)	1
  (0, 5801)	1
  (0, 2255)	1
  (0, 2055)	1
  (0, 6022)	1
  (1, 5325)	1
  (1, 6022)	2
  (1, 3623)	1
  (1, 4044)	1
  (1, 1181)	2
  (1, 2599)	1
  (1, 4901)	1
  (1, 1418)	2
  :	:
  (33290, 451)	1
  (33290, 6025)	1
  (33290, 3755)	1
  (33290, 4936)	1
  (33290, 6019)	1
  (33290, 281)	1
  (33290, 3547)	1
  (33290, 3990)	1
  (33290, 3872)	1
  (33290, 3493)	1
  (33290, 3314)	2
  (33290, 3695)	1
  (33290, 532)	1
  (33290, 911)	1
  (33290, 4025)	1
  (33290, 3931)	2
  (33290, 5701)	2
  (33290, 1475)	2
  (33290, 1479)	3
  (33290, 3885)	1
  (33290, 63)	1
  (33290, 3212)	1
  (33290, 5623)	1
  (33290, 4713)	3
  (33290, 3636)	1


**Modeling**

In [19]:
#count vectorizer is used
model = multiclass.OneVsRestClassifier(estimator= linear_model.LogisticRegression(max_iter = 3000))

In [20]:
train_preds = model.predict(vectorized_train_data)
test_preds = model.predict(vectorized_test_data)

NotFittedError: This OneVsRestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
tfidf_pipe = pipeline.Pipeline(steps = [('vectorizer', TfidfVectorizer(min_df = 3, 
                                                                       stop_words={'english'})), 
                                        ('classifier', model)])

In [None]:
tfidf_pipe.fit(train_data, train_ans)
q = pd.DataFrame(columns = ['Score', 'F1'])

In [None]:
predict

In [None]:
for k in range(0, 11):
    p = 0.292 + k / 10000
    predict =  tfidf_pipe.predict_proba(test_data)
    predict[predict >= p] = 1
    predict[predict < p] = 0
    qwer = pd.DataFrame(predict, columns = genres)
    t = metrics.f1_score(test_ans, qwer, average = 'samples')
    q.loc[k] = [p, t]
q

In [None]:
qwer

In [None]:
question = pd.read_csv('test.csv',sep=',')
question_data = pd.DataFrame(question['dialogue'].str.replace('[^\w\s]',''))
question_data['dialogue'] = question_data['dialogue'].apply(lambda t: re.sub(r"BR", "", t))
question_data = pd.DataFrame(question_data['dialogue'].str.replace('[^\w\s]', ''))
question_data = pd.DataFrame(question_data['dialogue'].str.lower())

vectorized_question_data = vectorizer.transform(question_data)
question_data

In [None]:
ans =  tfidf_pipe.predict(question_data['dialogue'])
ans

In [21]:
tfidf_pipe.predict(question_data)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [22]:
out1 = lb.inverse_transform(ans)
out = [' '.join(x) for x in out1]

exit = pd.DataFrame(out, columns = ['genres'])
exit.index.names = ['id']
out

NameError: name 'ans' is not defined

In [23]:
exit.to_csv('out1.csv', index=True)

AttributeError: 'ZMQExitAutocall' object has no attribute 'to_csv'

In [24]:
tfidf_pipe.predict_proba(question_data)

array([[0.24520409, 0.00296291, 0.0056897 , 0.00488209, 0.46049631,
        0.00572824, 0.12738541, 0.00629805, 0.01157039, 0.03550831,
        0.03226712, 0.39011778, 0.0016829 , 0.13611927, 0.07297609,
        0.11304038, 0.14797405, 0.00100918, 0.06030631, 0.03463154]])

In [25]:
out2 = []
for i in out:
    if i:
        out2.append(i)
    else:
        out2.append('drama')
len(out2)

NameError: name 'out' is not defined

In [26]:
exit2 = pd.DataFrame(out2, columns = ['genres'])
exit2.index.names = ['id']
exit2.to_csv('out2.csv', index=True)

In [28]:
ase = pd.DataFrame(lb.fit_transform(data['genres']), columns = genres)
tfidf_pipe.fit(data['dialogue'], ase)


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(min_df=3, stop_words={'english'})),
                ('classifier',
                 OneVsRestClassifier(estimator=LogisticRegression(max_iter=3000)))])

In [29]:
ans2 =  tfidf_pipe.predict_proba(question_data['dialogue'])

In [30]:
ans2[5]

array([0.48589087, 0.00572766, 0.00320706, 0.00178886, 0.01200285,
       0.00179185, 0.13582525, 0.00629177, 0.00775192, 0.0056219 ,
       0.04561865, 0.27473716, 0.00150849, 0.04616735, 0.02712436,
       0.00486214, 0.00474943, 0.00088982, 0.478818  , 0.00929583])

In [31]:
ans2[ans2>=0.2923] = 1
ans2[ans2<0.2923] = 0
ans2[5]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0.])

In [32]:
out_proba = lb.inverse_transform(ans2)
cols = [1,2,3]
out_proba = [' '.join(x) for x in out_proba]

exit = pd.DataFrame(out_proba, columns = ['genres'])
exit.index.names = ['id']
out_proba

['drama thriller crime',
 'drama',
 'drama',
 'drama',
 'action thriller',
 'drama romance',
 'drama comedy romance',
 'drama comedy romance',
 'drama',
 'drama action adventure',
 'drama',
 'drama action thriller',
 'drama comedy',
 'action thriller',
 'drama thriller',
 'drama action',
 'drama romance',
 'drama thriller',
 'thriller',
 'drama action',
 'drama',
 'thriller crime',
 'thriller',
 'drama',
 'drama thriller',
 'drama',
 'drama comedy',
 'drama',
 'comedy',
 'action thriller',
 'action thriller crime',
 'drama',
 'drama thriller crime',
 'action thriller sci-fi',
 'thriller adventure',
 'drama thriller',
 'drama thriller',
 'thriller crime',
 'drama thriller crime',
 'drama',
 'drama thriller',
 'drama thriller',
 'drama comedy romance',
 'drama thriller',
 'drama thriller',
 'drama comedy',
 'drama',
 'drama horror thriller',
 'drama action thriller',
 'drama thriller',
 'thriller',
 'drama romance',
 'drama comedy thriller',
 'drama action',
 'drama romance',
 'drama thr

In [33]:
exit3 = pd.DataFrame(out_proba, columns = ['genres'])
exit3.index.names = ['id']
exit3.to_csv('out5.csv', index=True)