In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import pandas as pd


def get_ids(qids):
    """Turn prefixed question ids into an integer array.

    Every element of ``qids`` has its first character dropped and the
    remainder parsed with ``int`` (for example ``'Q397345'`` -> ``397345``).

    Args:
        qids: Iterable of sliceable id strings carrying a one-character prefix.

    Returns:
        numpy.ndarray with the parsed integers, in input order.
    """
    return np.asarray([int(qid[1:]) for qid in qids])


def get_texts(file_path, question_path):
    """Fetch the ``chars`` text of both questions for every pair in a CSV.

    Args:
        file_path: CSV with ``q1`` and ``q2`` columns of prefixed question ids.
        question_path: CSV of questions that has a ``chars`` column.

    Returns:
        Tuple ``(texts1, texts2)`` of lists; entry ``k`` of each list is the
        ``chars`` value looked up for the first / second question of pair ``k``.
    """
    chars = pd.read_csv(question_path)['chars']
    pairs = pd.read_csv(file_path)
    first_ids = get_ids(pairs['q1'])
    second_ids = get_ids(pairs['q2'])
    # The numeric part of each id is used directly as the lookup key into the
    # chars Series -- assumes the question file's row labels match those
    # numbers (TODO confirm against question.csv).
    texts1 = [chars[idx] for idx in first_ids]
    texts2 = [chars[idx] for idx in second_ids]
    return texts1, texts2


def make_submission(predict_prob, path='submission.csv'):
    """Write predictions to a single-column CSV file.

    The file begins with the header line ``y_pre`` and is followed by one line
    per element of ``predict_prob``, each converted with ``str``.

    Args:
        predict_prob: Iterable of prediction values.
        path: Destination file. Defaults to ``'submission.csv'`` in the
            current working directory, which is the original behaviour.
    """
    # The with-block closes the handle on exit (even on error), so no
    # explicit close() call is needed afterwards.
    with open(path, 'w') as out:
        out.write('y_pre\n')
        out.writelines(str(value) + '\n' for value in predict_prob)


# Single place to change when the data set moves; the three input paths below
# are derived from it and evaluate to the same strings as before.
DATA_DIR = 'C:\\Users\\Ax\\Desktop\\tqx\\mirror\\2b87f48d6d1e4c9c874f4b52fbb954e7_'
TRAIN_PATH = DATA_DIR + '\\train.csv'
TEST_PATH = DATA_DIR + '\\test.csv'
QUESTION_PATH = DATA_DIR + '\\question.csv'

print('Load files...')
questions = pd.read_csv(QUESTION_PATH)
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
# The 'chars' column of the question file is the corpus for the vectorizer.
corpus = questions['chars']

print('Fit the corpus...')
# Fit TF-IDF statistics on every question text (no transform happens here).
vec = TfidfVectorizer()
vec.fit(corpus)

print('Get texts...')
# get_texts re-reads the CSVs from disk and returns, per pair, the chars text
# of the first and of the second question.
train_texts1, train_texts2 = get_texts(TRAIN_PATH, QUESTION_PATH)
test_texts1, test_texts2 = get_texts(TEST_PATH, QUESTION_PATH)

Load files...
Fit the corpus...
Get texts...


In [3]:
# Pair up the looked-up training texts as two columns and save them to CSV
# without the index column; the last line previews the first rows.
sub = pd.DataFrame({'question1': train_texts1, 'question2': train_texts2})
sub.to_csv('C:\\Users\\Ax\\Desktop\\w+c\\train_qwc.csv', index=False)
sub.head()

Unnamed: 0,question1,question2
0,L2218 L2568 L0360 L0242 L2218 L0741,L3019 L0104 L0582 L2218 L1861 L1556 L0242
1,L2376 L2168 L0050 L1187 L0104 L2432 L0902 L014...,L0156 L2452 L1187 L0104 L2459 L2979 L2613 L0449
2,L2323 L1526 L2214 L1132 L2723 L1861 L2249 L050...,L2568 L0971 L1291 L0358 L0037 L2582
3,L0018 L2321 L1346 L2432 L0902 L1149 L1980 L187...,L3019 L0104 L1104 L1935 L1683 L2495 L2812
4,L2271 L1346 L1389 L2932 L0466 L2218 L1971 L221...,L0050 L1187 L0104 L1683 L2495 L2812 L1588 L255...


In [4]:
# Load the raw training pairs via the TRAIN_PATH constant from the first cell
# instead of repeating the absolute path literal.
df_train = pd.read_csv(TRAIN_PATH)
df_train.head()

Unnamed: 0,is_duplicate,q1,q2,id
0,1,Q397345,Q538594,0
1,0,Q193805,Q699273,1
2,0,Q085471,Q676160,2
3,0,Q189314,Q438123,3
4,0,Q267714,Q290126,4


In [5]:
# Combine the original training columns with the looked-up question texts and
# save the result; the last line previews the first rows.
is_duplicate = df_train['is_duplicate']
q1 = df_train['q1']
q2 = df_train['q2']
# Named train_id (not id) so the builtin id() stays usable in later cells.
train_id = df_train['id']
sub = pd.DataFrame({'is_duplicate': is_duplicate, 'q1': q1, 'q2': q2, 'question1': train_texts1, 'question2': train_texts2, 'id': train_id})
sub.to_csv('C:\\Users\\Ax\\Desktop\\w+c\\train.csv', index=False)
sub.head()

Unnamed: 0,is_duplicate,q1,q2,question1,question2,id
0,1,Q397345,Q538594,L2218 L2568 L0360 L0242 L2218 L0741,L3019 L0104 L0582 L2218 L1861 L1556 L0242,0
1,0,Q193805,Q699273,L2376 L2168 L0050 L1187 L0104 L2432 L0902 L014...,L0156 L2452 L1187 L0104 L2459 L2979 L2613 L0449,1
2,0,Q085471,Q676160,L2323 L1526 L2214 L1132 L2723 L1861 L2249 L050...,L2568 L0971 L1291 L0358 L0037 L2582,2
3,0,Q189314,Q438123,L0018 L2321 L1346 L2432 L0902 L1149 L1980 L187...,L3019 L0104 L1104 L1935 L1683 L2495 L2812,3
4,0,Q267714,Q290126,L2271 L1346 L1389 L2932 L0466 L2218 L1971 L221...,L0050 L1187 L0104 L1683 L2495 L2812 L1588 L255...,4


In [6]:
# Same export as for the training texts, now for the test pairs: two text
# columns written to CSV without the index, followed by a preview.
sub = pd.DataFrame({'question1': test_texts1, 'question2': test_texts2})
sub.to_csv('C:\\Users\\Ax\\Desktop\\w+c\\test_qwc.csv', index=False)
sub.head()

Unnamed: 0,question1,question2
0,L0358 L0143 L0942 L1872 L1236 L3046 L0055 L258...,L1791 L2214 L1872 L1236 L0947 L2323
1,L2214 L1980 L1526 L2669 L0590 L2812 L0549 L000...,L1796 L2568 L0127 L0004 L0030 L2120 L2927
2,L2214 L1132 L2292 L0158 L3019 L0104 L0156 L0762,L2253 L1796 L2568 L0156 L0762 L1486 L2292 L1759
3,L2172 L1074 L0582 L2218 L1861 L2705 L1037,L2218 L1861 L0377 L2619
4,L2214 L1132 L2218 L1861 L1536 L0146 L0607 L186...,L2572 L0135 L0562 L0445 L1187 L0104 L2705 L125...


In [7]:
# Load the raw test pairs via the TEST_PATH constant from the first cell
# instead of repeating the absolute path literal.
df_test = pd.read_csv(TEST_PATH)
df_test.head()

Unnamed: 0,q1,q2,test_id
0,Q017571,Q006012,0
1,Q728241,Q542572,1
2,Q166997,Q118270,2
3,Q422420,Q514386,3
4,Q354329,Q032485,4


In [8]:
# Combine the original test columns with the looked-up question texts and save
# the result. Note that q1 and q2 are rebound here to the test columns.
test_id=df_test['test_id']
q1=df_test['q1']
q2=df_test['q2']
sub = pd.DataFrame({'test_id':test_id,'q1':q1,'q2':q2,'question1': test_texts1, 'question2': test_texts2})
sub.to_csv('C:\\Users\\Ax\\Desktop\\w+c\\test.csv',index=False)
# Preview the first rows of the exported frame.
sub.head()

Unnamed: 0,test_id,q1,q2,question1,question2
0,0,Q017571,Q006012,L0358 L0143 L0942 L1872 L1236 L3046 L0055 L258...,L1791 L2214 L1872 L1236 L0947 L2323
1,1,Q728241,Q542572,L2214 L1980 L1526 L2669 L0590 L2812 L0549 L000...,L1796 L2568 L0127 L0004 L0030 L2120 L2927
2,2,Q166997,Q118270,L2214 L1132 L2292 L0158 L3019 L0104 L0156 L0762,L2253 L1796 L2568 L0156 L0762 L1486 L2292 L1759
3,3,Q422420,Q514386,L2172 L1074 L0582 L2218 L1861 L2705 L1037,L2218 L1861 L0377 L2619
4,4,Q354329,Q032485,L2214 L1132 L2218 L1861 L1536 L0146 L0607 L186...,L2572 L0135 L0562 L0445 L1187 L0104 L2705 L125...


In [15]:
# Rebind df_train to an enriched training CSV read back from disk.
# NOTE(review): this reads from a 'chars' directory, while the visible export
# cells write to 'w+c' -- presumably the file was copied by hand; confirm.
df_train = pd.read_csv('C:\\Users\\Ax\\Desktop\\chars\\train.csv')
df_train.head()

Unnamed: 0,is_duplicate,q1,q2,question1,question2,id
0,1,Q397345,Q538594,L2218 L2568 L0360 L0242 L2218 L0741,L3019 L0104 L0582 L2218 L1861 L1556 L0242,0
1,0,Q193805,Q699273,L2376 L2168 L0050 L1187 L0104 L2432 L0902 L014...,L0156 L2452 L1187 L0104 L2459 L2979 L2613 L0449,1
2,0,Q085471,Q676160,L2323 L1526 L2214 L1132 L2723 L1861 L2249 L050...,L2568 L0971 L1291 L0358 L0037 L2582,2
3,0,Q189314,Q438123,L0018 L2321 L1346 L2432 L0902 L1149 L1980 L187...,L3019 L0104 L1104 L1935 L1683 L2495 L2812,3
4,0,Q267714,Q290126,L2271 L1346 L1389 L2932 L0466 L2218 L1971 L221...,L0050 L1187 L0104 L1683 L2495 L2812 L1588 L255...,4


In [16]:
# Rebind df_test to an enriched test CSV read back from disk.
# NOTE(review): this reads from a 'chars' directory, while the visible export
# cells write to 'w+c' -- presumably the file was copied by hand; confirm.
df_test = pd.read_csv('C:\\Users\\Ax\\Desktop\\chars\\test.csv')
df_test.head()

Unnamed: 0,test_id,q1,q2,question1,question2
0,0,Q017571,Q006012,L0358 L0143 L0942 L1872 L1236 L3046 L0055 L258...,L1791 L2214 L1872 L1236 L0947 L2323
1,1,Q728241,Q542572,L2214 L1980 L1526 L2669 L0590 L2812 L0549 L000...,L1796 L2568 L0127 L0004 L0030 L2120 L2927
2,2,Q166997,Q118270,L2214 L1132 L2292 L0158 L3019 L0104 L0156 L0762,L2253 L1796 L2568 L0156 L0762 L1486 L2292 L1759
3,3,Q422420,Q514386,L2172 L1074 L0582 L2218 L1861 L2705 L1037,L2218 L1861 L0377 L2619
4,4,Q354329,Q032485,L2214 L1132 L2218 L1861 L1536 L0146 L0607 L186...,L2572 L0135 L0562 L0445 L1187 L0104 L2705 L125...


In [54]:
# Keep the first-question text column of the test frame for the cells below.
question1=df_test['question1']

In [None]:
import os
# Iterating a string yields its characters one by one, so joining a text with
# ' ' puts a space between every character (spaces already in the text are
# kept and end up surrounded by the inserted ones).
lines = [' '.join(text) for text in question1]


In [65]:
# Display the list built in the previous cell.
# NOTE(review): this renders every entry; a slice such as lines[:5] would keep
# the saved notebook small.
lines

['L 0 3 5 8   L 0 1 4 3   L 0 9 4 2   L 1 8 7 2   L 1 2 3 6   L 3 0 4 6   L 0 0 5 5   L 2 5 8 2   L 2 9 2 7',
 'L 2 2 1 4   L 1 9 8 0   L 1 5 2 6   L 2 6 6 9   L 0 5 9 0   L 2 8 1 2   L 0 5 4 9   L 0 0 0 4   L 0 8 8 2   L 1 1 3 2   L 0 0 3 0   L 2 1 2 0',
 'L 2 2 1 4   L 1 1 3 2   L 2 2 9 2   L 0 1 5 8   L 3 0 1 9   L 0 1 0 4   L 0 1 5 6   L 0 7 6 2',
 'L 2 1 7 2   L 1 0 7 4   L 0 5 8 2   L 2 2 1 8   L 1 8 6 1   L 2 7 0 5   L 1 0 3 7',
 'L 2 2 1 4   L 1 1 3 2   L 2 2 1 8   L 1 8 6 1   L 1 5 3 6   L 0 1 4 6   L 0 6 0 7   L 1 8 6 1   L 0 1 4 3   L 1 3 8 9   L 2 9 3 2',
 'L 3 0 3 2   L 2 4 0 3   L 3 0 1 9   L 0 1 0 4   L 0 5 1 0   L 1 3 3 1   L 1 3 3 1   L 0 0 1 6   L 1 1 3 2   L 2 4 1 4   L 1 0 6 1   L 0 6 3 3   L 0 3 6 2',
 'L 2 5 6 8   L 2 2 9 2   L 0 1 5 8   L 1 1 2 8   L 0 1 4 3   L 0 5 0 7',
 'L 2 2 7 2   L 1 5 5 4   L 1 7 4 2   L 0 5 7 8   L 0 9 8 4   L 0 1 4 3   L 0 5 0 7   L 2 5 8 2',
 'L 0 1 4 3   L 1 6 3 5   L 1 9 7 1   L 2 5 8 2   L 1 7 7 1   L 0 1 9 3   L 1 4 6 4   L 0 2 4 2 